API Reference

Public API of bls_release_dates, generated from docstrings (Google style).

Package

`bls_release_dates`

BLS news release scraper for CES, SAE, and QCEW release dates.

`Publication(name, series, index_url, frequency)` `dataclass`

BLS publication: name, series code, index URL, and frequency.

Attributes:

Name	Type	Description
`name`	`str`	Short name (e.g. "ces", "sae", "qcew").
`series`	`str`	BLS series code used in archive URLs (e.g. "empsit", "laus").
`index_url`	`str`	Full URL of the news release archive index page.
`frequency`	`str`	Either "monthly" or "quarterly".

`main()`

Run full pipeline: download, build release_dates, build vintage_dates.

Source code in src/bls_release_dates/__main__.py

def main() -> None:
    """Execute the whole pipeline from download to both parquet outputs.

    Steps, in order: download every configured publication's release pages,
    rebuild the release_dates table and write it to PARQUET_PATH, then rebuild
    the vintage_dates table and write it to VINTAGE_DATES_PATH. Progress is
    printed to stdout.
    """
    asyncio.run(download_all_publications())

    # release_dates must be written first: build_vintage_dates() is called
    # only after this file is in place.
    print("Building release_dates...")
    release_df = build_dataframe()
    release_parent = PARQUET_PATH.parent
    release_parent.mkdir(parents=True, exist_ok=True)
    release_df.write_parquet(PARQUET_PATH)
    print(f"Wrote {PARQUET_PATH} ({len(release_df)} rows)")

    print("Building vintage_dates...")
    vintage_df = build_vintage_dates()
    vintage_parent = VINTAGE_DATES_PATH.parent
    vintage_parent.mkdir(parents=True, exist_ok=True)
    vintage_df.write_parquet(VINTAGE_DATES_PATH)
    print(f"Wrote {VINTAGE_DATES_PATH} ({len(vintage_df)} rows)")

`build_dataframe()`

Parse all downloaded HTML files into a release_dates DataFrame.

Source code in src/bls_release_dates/__main__.py

def build_dataframe() -> pl.DataFrame:
    """Assemble the release_dates table from the downloaded HTML files.

    Each publication in PUBLICATIONS contributes the rows yielded by
    collect_release_dates for its directory under DATA_DIR; publications
    whose directory does not exist are skipped.

    Returns:
        DataFrame with columns publication (Utf8), ref_date (Date) and
        vintage_date (Date), sorted by publication and then ref_date.
    """
    records = []
    for publication in PUBLICATIONS:
        releases_dir = DATA_DIR / publication.name
        if releases_dir.exists():
            records.extend(collect_release_dates(publication.name, releases_dir))

    column_types = {"publication": pl.Utf8, "ref_date": pl.Date, "vintage_date": pl.Date}
    table = pl.DataFrame(records, schema=column_types, orient="row")
    return table.sort("publication", "ref_date")

`download_all_publications()` `async`

Download release HTML files for all configured publications.

Source code in src/bls_release_dates/__main__.py

async def download_all_publications() -> None:
    """Download release HTML files for all configured publications.

    For each entry in PUBLICATIONS, in order: fetch its archive index page,
    parse the page into release entries, and pass those entries to
    download_all. Publications are processed one after another, and progress
    is printed to stdout.
    """
    # This client is used for the index pages only; it is not passed to
    # download_all.
    async with httpx.AsyncClient(
        http2=True, follow_redirects=True, timeout=30.0,
    ) as client:
        for pub in PUBLICATIONS:
            print(f"Fetching index for {pub.name}...")
            html = await fetch_index(client, pub.index_url)
            entries = parse_index_page(html, pub.name, pub.series, pub.frequency)
            print(f"  Found {len(entries)} releases for {pub.name}")
            paths = await download_all(entries, pub.name)
            # download_all returns paths for files that were already on disk
            # as well as freshly written ones, so the count is not "new files".
            print(f"  {len(paths)} files available for {pub.name} (downloaded or already present)")

`read_release_dates(path=None)`

Read release_dates parquet if it exists.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str \| None`	Optional path to the parquet file. Defaults to data/release_dates.parquet relative to the current working directory.	`None`

Returns:

Type	Description
`DataFrame \| None`	Polars DataFrame with columns publication, ref_date, vintage_date, or None if the file has not been created yet.

Source code in src/bls_release_dates/read.py

def read_release_dates(path: Path | str | None = None) -> pl.DataFrame | None:
    """Load the release_dates parquet file when one is present.

    Args:
        path: Location of the parquet file. When omitted (None), PARQUET_PATH
            is used (documented as data/release_dates.parquet relative to the
            current working directory).

    Returns:
        Polars DataFrame with columns publication, ref_date, vintage_date, or
        None when nothing exists at the resolved location.
    """
    target = PARQUET_PATH if path is None else Path(path)
    return pl.read_parquet(target) if target.exists() else None

`read_vintage_dates(path=None)`

Read vintage_dates parquet if it exists.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str \| None`	Optional path to the parquet file. Defaults to data/vintage_dates.parquet relative to the current working directory.	`None`

Returns:

Type	Description
`DataFrame \| None`	Polars DataFrame with columns publication, ref_date, vintage_date, revision, benchmark_revision, or None if the file has not been created yet.

Source code in src/bls_release_dates/read.py

def read_vintage_dates(path: Path | str | None = None) -> pl.DataFrame | None:
    """Load the vintage_dates parquet file when one is present.

    Args:
        path: Location of the parquet file. When omitted (None),
            VINTAGE_DATES_PATH is used (documented as
            data/vintage_dates.parquet relative to the current working
            directory).

    Returns:
        Polars DataFrame with columns publication, ref_date, vintage_date,
        revision, benchmark_revision, or None when nothing exists at the
        resolved location.
    """
    target = VINTAGE_DATES_PATH if path is None else Path(path)
    return pl.read_parquet(target) if target.exists() else None

Configuration

`bls_release_dates.config`

Publication definitions and paths.

`Publication(name, series, index_url, frequency)` `dataclass`

BLS publication: name, series code, index URL, and frequency.

Attributes:

Name	Type	Description
`name`	`str`	Short name (e.g. "ces", "sae", "qcew").
`series`	`str`	BLS series code used in archive URLs (e.g. "empsit", "laus").
`index_url`	`str`	Full URL of the news release archive index page.
`frequency`	`str`	Either "monthly" or "quarterly".

Parser

`bls_release_dates.parser`

Extract release (vintage) date from downloaded BLS release HTML files.

`parse_vintage_date(html_content)`

Extract release (vintage) date from embargo line in HTML.

Parameters:

Name	Type	Description	Default
`html_content`	`str`	Raw HTML of a BLS release page.	required

Returns:

Type	Description
`date \| None`	The release (vintage) date if found in the embargo line, None otherwise.

Source code in src/bls_release_dates/parser.py

def parse_vintage_date(html_content: str) -> date | None:
    """Pull the release (vintage) date out of a release page.

    The page is searched with VINTAGE_DATE_RE; its three capture groups are
    read as month name, day and year.

    Args:
        html_content: Raw HTML of a BLS release page.

    Returns:
        The matched date, or None when the pattern does not match, the month
        name is not a key of MONTH_TO_NUM, or the pieces do not form a valid
        calendar date.
    """
    found = VINTAGE_DATE_RE.search(html_content)
    if not found:
        return None

    month_name, day_text, year_text = found.group(1, 2, 3)
    month_number = MONTH_TO_NUM.get(month_name)
    if month_number is None:
        return None

    try:
        return date(int(year_text), month_number, int(day_text))
    except (ValueError, TypeError):
        # Non-numeric pieces or an impossible date such as February 30.
        return None

`parse_ref_from_path(path)`

Parse reference year and month from a release filename.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to a file named like {pub}_{yyyy}_{mm}.htm (e.g. ces_2010_03.htm).	required

Returns:

Type	Description
`tuple[int, int] \| None`	(year, month) if the stem matches the expected pattern and values are valid, None otherwise. Month is 1-12, year is 2000-2100.

Source code in src/bls_release_dates/parser.py

def parse_ref_from_path(path: Path) -> tuple[int, int] | None:
    """Parse reference year and month from a release filename.

    Args:
        path: Path to a file named like {pub}_{yyyy}_{mm}.htm (e.g. ces_2010_03.htm).

    Returns:
        (year, month) if the stem matches the expected pattern and values are valid,
        None otherwise. Month is 1-12, year is 2000-2100.
    """
    # filename: {pub}_{yyyy}_{mm}.htm
    stem = path.stem
    parts = stem.split("_")
    if len(parts) != 3:
        return None
    try:
        yyyy, mm = int(parts[1]), int(parts[2])
        if 1 <= mm <= 12 and 2000 <= yyyy <= 2100:
            return (yyyy, mm)
    except ValueError:
        pass
    return None

`ref_date_from_year_month(year, month)`

Return the reference date for a given year and month.

The reference date is always the 12th of the reference month.

Parameters:

Name	Type	Description	Default
`year`	`int`	Reference year.	required
`month`	`int`	Reference month (1-12).	required

Returns:

Type	Description
`date`	date(year, month, 12).

Source code in src/bls_release_dates/parser.py

def ref_date_from_year_month(year: int, month: int) -> date:
    """Build the reference date for a reference year and month.

    The day component is fixed at 12, so the result is always the 12th of the
    given month.

    Args:
        year: Reference year.
        month: Reference month (1-12).

    Returns:
        date(year, month, 12).
    """
    reference_day = 12
    return date(year=year, month=month, day=reference_day)

`parse_release_file(path, publication_name)`

Read a release HTML file and extract publication, ref_date, and vintage_date.

ref_date is the 12th of the reference month (from the filename); vintage_date is parsed from the embargo line in the HTML.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the release .htm file.	required
`publication_name`	`str`	Publication name (e.g. "ces", "sae", "qcew").	required

Returns:

Type	Description
`tuple[str, date, date] \| None`	(publication_name, ref_date, vintage_date) if both dates could be parsed, None otherwise.

Source code in src/bls_release_dates/parser.py

def parse_release_file(path: Path, publication_name: str) -> tuple[str, date, date] | None:
    """Read a release HTML file and extract publication, ref_date, and vintage_date.

    ref_date is the 12th of the reference month (from the filename); vintage_date
    is parsed from the embargo line in the HTML.

    Args:
        path: Path to the release .htm file.
        publication_name: Publication name (e.g. "ces", "sae", "qcew").

    Returns:
        (publication_name, ref_date, vintage_date) if both dates could be parsed,
        None otherwise. None is also returned when the file cannot be read or
        is not valid UTF-8.
    """
    ref = parse_ref_from_path(path)
    if ref is None:
        return None
    ref_year, ref_month = ref
    ref_d = ref_date_from_year_month(ref_year, ref_month)

    try:
        content = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError; without it one
        # undecodable file would abort the caller's whole directory walk
        # instead of being reported as unparseable.
        return None

    vintage_d = parse_vintage_date(content)
    if vintage_d is None:
        return None

    return (publication_name, ref_d, vintage_d)

`collect_release_dates(publication_name, releases_dir)`

Walk a publication's release directory and yield parsed release rows.

Glob pattern used: {publication_name}_*.htm. Logs a warning and skips files where the vintage date cannot be parsed.

Parameters:

Name	Type	Description	Default
`publication_name`	`str`	Publication name (e.g. "ces", "sae", "qcew").	required
`releases_dir`	`Path`	Directory containing release .htm files.	required

Yields:

Type	Description
`tuple[str, date, date]`	Tuples of (publication_name, ref_date, vintage_date) for each valid file.

Source code in src/bls_release_dates/parser.py

def collect_release_dates(publication_name: str, releases_dir: Path) -> Iterator[tuple[str, date, date]]:
    """Yield one parsed row per release file found in a publication's directory.

    Files are matched with the glob {publication_name}_*.htm and visited in
    sorted path order. A file for which parse_release_file returns None is
    logged at warning level and skipped.

    Args:
        publication_name: Publication name (e.g. "ces", "sae", "qcew").
        releases_dir: Directory containing release .htm files.

    Yields:
        Tuples of (publication_name, ref_date, vintage_date) for each valid file.
    """
    import logging

    logger = logging.getLogger(__name__)
    candidates = sorted(releases_dir.glob(f"{publication_name}_*.htm"))
    for candidate in candidates:
        parsed = parse_release_file(candidate, publication_name)
        if parsed is not None:
            yield parsed
        else:
            logger.warning("Could not parse release date from %s", candidate)

Scraper

`bls_release_dates.scraper`

Fetch BLS archive index pages and download release HTML files.

`ReleaseEntry(ref_year, ref_month, url)` `dataclass`

A single release: reference year, month, and archive URL.

Attributes:

Name	Type	Description
`ref_year`	`int`	Reference year (e.g. 2010).
`ref_month`	`int`	Reference month 1-12.
`url`	`str`	Full URL to the release HTML (e.g. .../archives/empsit_04022010.htm).

`archive_href_re(series)`

Build a regex that matches archive hrefs for the given BLS series.

Parameters:

Name	Type	Description	Default
`series`	`str`	BLS series code (e.g. "empsit", "laus", "cewqtr").	required

Returns:

Type	Description
`Pattern`	Compiled regex matching paths like /news.release/archives/{series}_MMDDYYYY.htm.

Source code in src/bls_release_dates/scraper.py

def archive_href_re(series: str) -> re.Pattern:
    """Compile the pattern that recognises archive links for one BLS series.

    The series code is escaped before being embedded, so regex
    metacharacters in it are matched literally.

    Args:
        series: BLS series code (e.g. "empsit", "laus", "cewqtr").

    Returns:
        Compiled regex matching paths like /news.release/archives/{series}_MMDDYYYY.htm
        (the date part is any eight digits).
    """
    escaped_series = re.escape(series)
    pattern_text = rf"/news\.release/archives/{escaped_series}_\d{{8}}\.htm"
    return re.compile(pattern_text)

`parse_index_page(html, publication_name, series, frequency)`

Parse an archive index page into release entries.

Only includes entries for years >= START_YEAR. For monthly publications, parses "Month YYYY" from list/link text; for quarterly, parses "First/Second/Third/Fourth Quarter" and uses the section year.

Parameters:

Name	Type	Description	Default
`html`	`str`	Raw HTML of the BLS news release archive index page.	required
`publication_name`	`str`	Publication name (e.g. "ces", "sae", "qcew").	required
`series`	`str`	BLS series code used to match archive links.	required
`frequency`	`str`	"monthly" or "quarterly".	required

Returns:

Type	Description
`list[ReleaseEntry]`	List of ReleaseEntry (ref_year, ref_month, url) for each release found.

Source code in src/bls_release_dates/scraper.py

def parse_index_page(html: str, publication_name: str, series: str, frequency: str) -> list[ReleaseEntry]:
    """Parse an archive index page into release entries.

    The page is walked one <h4> heading at a time. A heading must contain a
    year (YEAR_RE) that is >= START_YEAR, and the list located by
    _find_next_ul supplies that section's items. Only direct <li> children
    are considered, and an item is used only if it contains a link whose href
    matches the series' archive pattern.

    For monthly publications, "Month YYYY" is parsed from the item text
    (falling back to the link text), and the year comes from that text. For
    quarterly publications, "First/Second/Third/Fourth Quarter" is parsed
    from the item text and the section heading's year is used.

    Args:
        html: Raw HTML of the BLS news release archive index page.
        publication_name: Publication name (e.g. "ces", "sae", "qcew").
            Not used by the parsing logic; kept for interface compatibility.
        series: BLS series code used to match archive links.
        frequency: "monthly" selects month parsing; any other value is
            handled as quarterly.

    Returns:
        List of ReleaseEntry (ref_year, ref_month, url) for each release found.
    """
    soup = BeautifulSoup(html, "lxml")
    archive_re = archive_href_re(series)
    entries: list[ReleaseEntry] = []

    for h4 in soup.find_all("h4"):
        year_match = YEAR_RE.search(h4.get_text())
        if not year_match:
            continue
        year = int(year_match.group(1))
        if year < START_YEAR:
            continue

        ul = _find_next_ul(h4)
        if not ul:
            continue

        for li in ul.find_all("li", recursive=False):
            # First link in this item that points at an archive page. The
            # match is tested once here, so no re-check is needed afterwards.
            anchor = next(
                (
                    a
                    for a in li.find_all("a", href=True)
                    if archive_re.search(a.get("href", ""))
                ),
                None,
            )
            if anchor is None:
                continue

            url = _resolve_url(anchor.get("href", ""))
            li_text = li.get_text()

            if frequency == "monthly":
                month_match = MONTH_YEAR_RE.search(li_text) or MONTH_YEAR_RE.search(anchor.get_text() or "")
                if not month_match:
                    continue
                month_name, year_str = month_match.group(1), month_match.group(2)
                ref_year = int(year_str)
                ref_month = MONTH_TO_NUM.get(month_name)
                if ref_month is None:
                    continue
            else:
                quarter_match = QUARTER_RE.search(li_text)
                if not quarter_match:
                    continue
                quarter_name = quarter_match.group(1)
                # Quarterly items carry no year of their own here; the
                # section heading's year is used.
                ref_year = year
                ref_month = QUARTER_TO_MONTH.get(quarter_name)
                if ref_month is None:
                    continue

            entries.append(ReleaseEntry(ref_year=ref_year, ref_month=ref_month, url=url))

    return entries

`fetch_index(client, url)` `async`

Fetch index page HTML.

Parameters:

Name	Type	Description	Default
`client`	`AsyncClient`	HTTP client to use.	required
`url`	`str`	URL of the archive index page.	required

Returns:

Type	Description
`str`	Response body text. Raises on HTTP errors.

Source code in src/bls_release_dates/scraper.py

async def fetch_index(client: httpx.AsyncClient, url: str) -> str:
    """Retrieve an archive index page and return its HTML text.

    The request is sent with DEFAULT_HEADERS.

    Args:
        client: HTTP client to use.
        url: URL of the archive index page.

    Returns:
        Response body text. An error response is not returned: whatever
        raise_for_status() raises propagates to the caller.
    """
    response = await client.get(url, headers=DEFAULT_HEADERS)
    response.raise_for_status()
    body = response.text
    return body

`download_one(client, semaphore, entry, publication_name, out_dir)` `async`

Download one release HTML to out_dir/{pub}_{yyyy}_{mm}.htm.

Skips download if the file already exists. Uses the semaphore to limit concurrency when called from download_all.

Parameters:

Name	Type	Description	Default
`client`	`AsyncClient`	HTTP client to use.	required
`semaphore`	`Semaphore`	Semaphore for concurrency control.	required
`entry`	`ReleaseEntry`	Release entry with ref_year, ref_month, and url.	required
`publication_name`	`str`	Publication name for the filename.	required
`out_dir`	`Path`	Directory to write the .htm file into.	required

Returns:

Type	Description
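`Path \| None`	Path to the written or already-existing file. The listed implementation never returns None: it returns the existing path, returns the newly written path, or raises.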

Source code in src/bls_release_dates/scraper.py

async def download_one(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    entry: ReleaseEntry,
    publication_name: str,
    out_dir: Path,
) -> Path | None:
    """Download one release HTML to out_dir/{pub}_{yyyy}_{mm}.htm.

    Skips the download if the file already exists. Uses the semaphore to limit
    concurrency when called from download_all.

    Args:
        client: HTTP client to use.
        semaphore: Semaphore for concurrency control.
        entry: Release entry with ref_year, ref_month, and url.
        publication_name: Publication name for the filename.
        out_dir: Directory to write the .htm file into (created if missing).

    Returns:
        Path to the written or already-existing file. This implementation
        never returns None; the return annotation is kept for compatibility.

    Raises:
        Any exception raised by the request, by raise_for_status(), or by
        writing the file propagates to the caller unchanged.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    mm = f"{entry.ref_month:02d}"
    path = out_dir / f"{publication_name}_{entry.ref_year}_{mm}.htm"
    if path.exists():
        # Already downloaded: return without taking the semaphore.
        return path

    # The former `try/except Exception: raise` wrapper only re-raised, so
    # errors propagate exactly as before without it.
    async with semaphore:
        r = await client.get(entry.url)
        r.raise_for_status()
        path.write_text(r.text, encoding="utf-8")
    return path

`download_all(entries, publication_name, concurrency=5)` `async`

Download all release HTMLs for a publication; skip existing files.

Parameters:

Name	Type	Description	Default
`entries`	`list[ReleaseEntry]`	List of ReleaseEntry from parse_index_page.	required
`publication_name`	`str`	Publication name (e.g. "ces", "sae", "qcew").	required
`concurrency`	`int`	Max concurrent requests (default 5).	`5`

Returns:

Type	Description
`list[Path]`	List of paths to written or already-existing .htm files.

Source code in src/bls_release_dates/scraper.py

async def download_all(
    entries: list[ReleaseEntry],
    publication_name: str,
    concurrency: int = 5,
) -> list[Path]:
    """Download all release HTMLs for a publication; skip existing files.

    Files go to DATA_DIR/{publication_name}. One download_one task is created
    per entry and all are awaited together; the semaphore caps how many
    requests run at once.

    Args:
        entries: List of ReleaseEntry from parse_index_page.
        publication_name: Publication name (e.g. "ces", "sae", "qcew").
        concurrency: Max concurrent requests (default 5).

    Returns:
        List of paths to written or already-existing .htm files.

    Raises:
        The first exception (in entry order) raised by any download, after
        all downloads have finished.
    """
    out_dir = DATA_DIR / publication_name
    semaphore = asyncio.Semaphore(concurrency)

    async with httpx.AsyncClient(
        http2=True,
        base_url=BASE_URL,
        follow_redirects=True,
        timeout=30.0,
        headers=DEFAULT_HEADERS,
    ) as client:
        tasks = [
            download_one(client, semaphore, e, publication_name, out_dir)
            for e in entries
        ]
        # return_exceptions=True lets every download run to completion;
        # failures are inspected below.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    paths: list[Path] = []
    for result in results:
        # BaseException, not Exception: asyncio.CancelledError derives from
        # BaseException and would otherwise be appended as if it were a path.
        if isinstance(result, BaseException):
            raise result
        if result is not None:
            paths.append(result)
    return paths

Read helpers

`bls_release_dates.read`

Read release_dates or vintage_dates parquet files if they exist.

`read_release_dates(path=None)`

Read release_dates parquet if it exists.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str \| None`	Optional path to the parquet file. Defaults to data/release_dates.parquet relative to the current working directory.	`None`

Returns:

Type	Description
`DataFrame \| None`	Polars DataFrame with columns publication, ref_date, vintage_date, or None if the file has not been created yet.

Source code in src/bls_release_dates/read.py

def read_release_dates(path: Path | str | None = None) -> pl.DataFrame | None:
    """Load the release_dates parquet file when one is present.

    Args:
        path: Location of the parquet file. When omitted (None), PARQUET_PATH
            is used (documented as data/release_dates.parquet relative to the
            current working directory).

    Returns:
        Polars DataFrame with columns publication, ref_date, vintage_date, or
        None when nothing exists at the resolved location.
    """
    target = PARQUET_PATH if path is None else Path(path)
    return pl.read_parquet(target) if target.exists() else None

`read_vintage_dates(path=None)`

Read vintage_dates parquet if it exists.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str \| None`	Optional path to the parquet file. Defaults to data/vintage_dates.parquet relative to the current working directory.	`None`

Returns:

Type	Description
`DataFrame \| None`	Polars DataFrame with columns publication, ref_date, vintage_date, revision, benchmark_revision, or None if the file has not been created yet.

Source code in src/bls_release_dates/read.py

def read_vintage_dates(path: Path | str | None = None) -> pl.DataFrame | None:
    """Load the vintage_dates parquet file when one is present.

    Args:
        path: Location of the parquet file. When omitted (None),
            VINTAGE_DATES_PATH is used (documented as
            data/vintage_dates.parquet relative to the current working
            directory).

    Returns:
        Polars DataFrame with columns publication, ref_date, vintage_date,
        revision, benchmark_revision, or None when nothing exists at the
        resolved location.
    """
    target = VINTAGE_DATES_PATH if path is None else Path(path)
    return pl.read_parquet(target) if target.exists() else None

Vintage dates (revisions)

`bls_release_dates.vintage_dates`

Build vintage_dates dataset from release_dates.parquet with revision codes.

Revision semantics (publication-specific; may not hold for most recent ref_dates):

0: initial release (vintage_date from release_dates.parquet)
1, 2, ...: subsequent revisions (vintage_date shifted by 1, 2, ... months)
9: benchmark revision (CES and SAE only)

benchmark_revision: 0 = not a benchmark row; 1 = first benchmark; 2 = second benchmark (SAE re-replacement only).

CES: revisions 0, 1, 2, and 9. Benchmark 9 only for March ref_date (vintage = Jan release next year); benchmark_revision=1.
SAE: revisions 0, 1, and 9. Benchmark 9 twice for April–September ref_dates (double-revision): first at March Y+1 (benchmark_revision=1), second at March Y+2 (benchmark_revision=2).
QCEW: by quarter of ref_date — Q1: 0,1,2,3,4; Q2: 0,1,2,3; Q3: 0,1,2; Q4: 0,1. No benchmarks (benchmark_revision=0).

`build_vintage_dates(release_dates_path=None)`

Build vintage_dates DataFrame from release_dates parquet.

Applies publication-specific revision logic (CES 0,1,2 + benchmark; SAE 0,1 + benchmarks; QCEW 0..max by quarter), filters to vintage_date <= today, and sorts by publication, ref_date, vintage_date, revision, benchmark_revision.

Parameters:

Name	Type	Description	Default
`release_dates_path`	`Path \| None`	Path to release_dates.parquet. Defaults to config.PARQUET_PATH.	`None`

Returns:

Type	Description
`DataFrame`	Polars DataFrame with columns publication, ref_date, vintage_date, revision, benchmark_revision.

Source code in src/bls_release_dates/vintage_dates.py

def build_vintage_dates(release_dates_path: Path | None = None) -> pl.DataFrame:
    """Derive the vintage_dates table from the release_dates parquet file.

    The release rows are expanded by the publication-specific revision
    helpers (CES 0,1,2; SAE 0,1; QCEW 0..max by quarter), the CES and SAE
    benchmark rows are appended, rows whose vintage_date lies after today are
    dropped, and the result is sorted by publication, ref_date, vintage_date,
    revision, benchmark_revision.

    Args:
        release_dates_path: Path to release_dates.parquet. Falls back to
            PARQUET_PATH when not given.

    Returns:
        Polars DataFrame with columns publication, ref_date, vintage_date,
        revision, benchmark_revision.
    """
    source = release_dates_path or PARQUET_PATH
    releases = pl.read_parquet(source)

    # Regular revision rows, one helper per publication.
    revision_frames = [
        _add_ces_revisions(releases),
        _add_sae_revisions(releases),
        _add_qcew_revisions(releases),
    ]
    regular_rows = pl.concat(revision_frames)

    # Benchmark rows (CES March, SAE Apr–Sep only).
    benchmark_frames = [
        _ces_benchmark_vintage_dates(releases),
        _sae_benchmark_vintage_dates(releases),
    ]
    benchmark_rows = pl.concat(benchmark_frames)

    combined = pl.concat([regular_rows, benchmark_rows])
    already_published = combined.filter(pl.col("vintage_date") <= pl.lit(date.today()))
    sort_keys = ["publication", "ref_date", "vintage_date", "revision", "benchmark_revision"]
    return already_published.sort(sort_keys)

`main()`

Build vintage_dates from release_dates and write data/vintage_dates.parquet.

Reads data/release_dates.parquet, applies revision logic, and writes data/vintage_dates.parquet. Creates the output directory if needed.

Source code in src/bls_release_dates/vintage_dates.py

def main() -> None:
    """Rebuild vintage_dates and store it at VINTAGE_DATES_PATH.

    Calls build_vintage_dates() with its default input, creates the output
    file's parent directory when it is missing, writes the parquet file, and
    prints the path and row count.
    """
    vintage_df = build_vintage_dates()
    output_dir = VINTAGE_DATES_PATH.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    vintage_df.write_parquet(VINTAGE_DATES_PATH)
    print(f"Wrote {VINTAGE_DATES_PATH} ({len(vintage_df)} rows)")

Entry point / main

`bls_release_dates.__main__`

CLI entry point: download BLS releases, build release_dates and vintage_dates.

`download_all_publications()` `async`

Download release HTML files for all configured publications.

Source code in src/bls_release_dates/__main__.py

async def download_all_publications() -> None:
    """Download release HTML files for all configured publications.

    For each entry in PUBLICATIONS, in order: fetch its archive index page,
    parse the page into release entries, and pass those entries to
    download_all. Publications are processed one after another, and progress
    is printed to stdout.
    """
    # This client is used for the index pages only; it is not passed to
    # download_all.
    async with httpx.AsyncClient(
        http2=True, follow_redirects=True, timeout=30.0,
    ) as client:
        for pub in PUBLICATIONS:
            print(f"Fetching index for {pub.name}...")
            html = await fetch_index(client, pub.index_url)
            entries = parse_index_page(html, pub.name, pub.series, pub.frequency)
            print(f"  Found {len(entries)} releases for {pub.name}")
            paths = await download_all(entries, pub.name)
            # download_all returns paths for files that were already on disk
            # as well as freshly written ones, so the count is not "new files".
            print(f"  {len(paths)} files available for {pub.name} (downloaded or already present)")

`build_dataframe()`

Parse all downloaded HTML files into a release_dates DataFrame.

Source code in src/bls_release_dates/__main__.py

def build_dataframe() -> pl.DataFrame:
    """Assemble the release_dates table from the downloaded HTML files.

    Each publication in PUBLICATIONS contributes the rows yielded by
    collect_release_dates for its directory under DATA_DIR; publications
    whose directory does not exist are skipped.

    Returns:
        DataFrame with columns publication (Utf8), ref_date (Date) and
        vintage_date (Date), sorted by publication and then ref_date.
    """
    records = []
    for publication in PUBLICATIONS:
        releases_dir = DATA_DIR / publication.name
        if releases_dir.exists():
            records.extend(collect_release_dates(publication.name, releases_dir))

    column_types = {"publication": pl.Utf8, "ref_date": pl.Date, "vintage_date": pl.Date}
    table = pl.DataFrame(records, schema=column_types, orient="row")
    return table.sort("publication", "ref_date")

`main()`

Run full pipeline: download, build release_dates, build vintage_dates.

Source code in src/bls_release_dates/__main__.py

def main() -> None:
    """Execute the whole pipeline from download to both parquet outputs.

    Steps, in order: download every configured publication's release pages,
    rebuild the release_dates table and write it to PARQUET_PATH, then rebuild
    the vintage_dates table and write it to VINTAGE_DATES_PATH. Progress is
    printed to stdout.
    """
    asyncio.run(download_all_publications())

    # release_dates must be written first: build_vintage_dates() is called
    # only after this file is in place.
    print("Building release_dates...")
    release_df = build_dataframe()
    release_parent = PARQUET_PATH.parent
    release_parent.mkdir(parents=True, exist_ok=True)
    release_df.write_parquet(PARQUET_PATH)
    print(f"Wrote {PARQUET_PATH} ({len(release_df)} rows)")

    print("Building vintage_dates...")
    vintage_df = build_vintage_dates()
    vintage_parent = VINTAGE_DATES_PATH.parent
    vintage_parent.mkdir(parents=True, exist_ok=True)
    vintage_df.write_parquet(VINTAGE_DATES_PATH)
    print(f"Wrote {VINTAGE_DATES_PATH} ({len(vintage_df)} rows)")

API Reference

Package

bls_release_dates

Publication(name, series, index_url, frequency) dataclass

main()

build_dataframe()

download_all_publications() async

read_release_dates(path=None)

read_vintage_dates(path=None)

Configuration

bls_release_dates.config

Publication(name, series, index_url, frequency) dataclass

Parser

bls_release_dates.parser

parse_vintage_date(html_content)

parse_ref_from_path(path)

ref_date_from_year_month(year, month)

parse_release_file(path, publication_name)

collect_release_dates(publication_name, releases_dir)

Scraper

bls_release_dates.scraper

ReleaseEntry(ref_year, ref_month, url) dataclass

archive_href_re(series)

parse_index_page(html, publication_name, series, frequency)

fetch_index(client, url) async

download_one(client, semaphore, entry, publication_name, out_dir) async

download_all(entries, publication_name, concurrency=5) async

Read helpers

bls_release_dates.read

read_release_dates(path=None)

read_vintage_dates(path=None)

Vintage dates (revisions)

bls_release_dates.vintage_dates

build_vintage_dates(release_dates_path=None)

main()

Entry point / main

bls_release_dates.__main__

download_all_publications() async

build_dataframe()

main()

`bls_release_dates`

`Publication(name, series, index_url, frequency)` `dataclass`

`main()`

`build_dataframe()`

`download_all_publications()` `async`

`read_release_dates(path=None)`

`read_vintage_dates(path=None)`

`bls_release_dates.config`

`Publication(name, series, index_url, frequency)` `dataclass`

`bls_release_dates.parser`

`parse_vintage_date(html_content)`

`parse_ref_from_path(path)`

`ref_date_from_year_month(year, month)`

`parse_release_file(path, publication_name)`

`collect_release_dates(publication_name, releases_dir)`

`bls_release_dates.scraper`

`ReleaseEntry(ref_year, ref_month, url)` `dataclass`

`archive_href_re(series)`

`parse_index_page(html, publication_name, series, frequency)`

`fetch_index(client, url)` `async`

`download_one(client, semaphore, entry, publication_name, out_dir)` `async`

`download_all(entries, publication_name, concurrency=5)` `async`

`bls_release_dates.read`

`read_release_dates(path=None)`

`read_vintage_dates(path=None)`

`bls_release_dates.vintage_dates`

`build_vintage_dates(release_dates_path=None)`

`main()`

`bls_release_dates.__main__`

`download_all_publications()` `async`

`build_dataframe()`

`main()`