| 1 | +"""timdex_dataset_api/run.py""" |
| 2 | + |
| 3 | +import concurrent.futures |
| 4 | +import logging |
| 5 | +import time |
| 6 | +from typing import TYPE_CHECKING |
| 7 | + |
| 8 | +import pandas as pd |
| 9 | +import pyarrow.parquet as pq |
| 10 | + |
| 11 | +if TYPE_CHECKING: |
| 12 | + from timdex_dataset_api.dataset import TIMDEXDataset |
| 13 | + |
| 14 | +logger = logging.getLogger(__name__) |
| 15 | + |
| 16 | + |
| 17 | +class TIMDEXRunManager: |
| 18 | + """Manages and provides access to ETL run metadata from the TIMDEX parquet dataset.""" |
| 19 | + |
| 20 | + def __init__(self, timdex_dataset: "TIMDEXDataset"): |
| 21 | + self.timdex_dataset: TIMDEXDataset = timdex_dataset |
| 22 | + if self.timdex_dataset.dataset is None: |
| 23 | + self.timdex_dataset.load() |
| 24 | + |
| 25 | + self._runs_metadata_cache: pd.DataFrame | None = None |
| 26 | + |
| 27 | + def clear_cache(self) -> None: |
| 28 | + self._runs_metadata_cache = None |

    def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
        """Get metadata for all runs in the dataset, grouped by run_id.

        The returned dataframe includes the following columns:
            - run_id
            - source
            - run_date
            - run_type
            - num_rows: total number of records for that run_id
            - parquet_files: list of parquet file(s) associated with that run
            - parquet_files_count: number of parquet files for that run

        Args:
            refresh: if True, force a refresh of the cached metadata.
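
        Example (illustrative output shape, not real data; assumes an
        instantiated ``run_manager``):
            >>> runs_df = run_manager.get_runs_metadata()
            >>> runs_df.columns.tolist()
            ['run_id', 'source', 'run_date', 'run_type', 'num_rows',
             'parquet_files', 'parquet_files_count']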
        """
        start_time = time.perf_counter()

        if self._runs_metadata_cache is not None and not refresh:
            return self._runs_metadata_cache

        ungrouped_runs_df = self._get_parquet_files_run_metadata()
        if ungrouped_runs_df.empty:
            return ungrouped_runs_df

        # group by run_id
        grouped_runs_df = (
            ungrouped_runs_df.groupby("run_id")
            .agg(
                {
                    "source": "first",
                    "run_date": "first",
                    "run_type": "first",
                    "num_rows": "sum",
                    "filename": list,
                }
            )
            .reset_index()
        )

        # add additional metadata
        grouped_runs_df = grouped_runs_df.rename(columns={"filename": "parquet_files"})
        grouped_runs_df["parquet_files_count"] = grouped_runs_df["parquet_files"].apply(
            len
        )

        # sort by run date and source, newest first
        grouped_runs_df = grouped_runs_df.sort_values(
            ["run_date", "source"], ascending=False
        )

        # cache the result
        self._runs_metadata_cache = grouped_runs_df

        logger.info(
            "Dataset runs metadata retrieved, elapsed: "
            f"{round(time.perf_counter() - start_time, 2)}s, runs: {len(grouped_runs_df)}"
        )
        return grouped_runs_df

    def get_current_source_parquet_files(self, source: str) -> list[str]:
        """Get a reverse chronological list of the current parquet files for a source.

        "Current" means the files from the most recent 'full' run, plus the files
        from any 'daily' runs performed since that full run.

        Args:
            source: source identifier used to filter runs.
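
        Example (hypothetical source and file names; daily files newest first,
        the full run's files last):
            >>> run_manager.get_current_source_parquet_files("alma")
            ['daily-2.parquet', 'daily-1.parquet', 'full-0.parquet']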
        """
        runs_df = self.get_runs_metadata()
        source_runs_df = runs_df[runs_df.source == source].copy()

        # get the most recent "full" run (runs metadata is sorted newest first)
        full_runs_df = source_runs_df[source_runs_df.run_type == "full"]
        if full_runs_df.empty:
            raise RuntimeError(
                f"Could not find the most recent 'full' run for source: '{source}'"
            )
        last_full_run = full_runs_df.iloc[0]

        # get all "daily" runs since that full run
        daily_runs_df = source_runs_df[
            (source_runs_df.run_type == "daily")
            & (source_runs_df.run_date >= last_full_run.run_date)
        ]

        # daily run files first (newest to oldest), then the full run's files
        ordered_parquet_files = []
        for _, daily_run in daily_runs_df.iterrows():
            ordered_parquet_files.extend(daily_run.parquet_files)
        ordered_parquet_files.extend(last_full_run.parquet_files)

        return ordered_parquet_files

    def _get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFrame:
        """Retrieve run metadata from all parquet file(s) in the dataset, in parallel.

        A single ETL run may be spread across multiple parquet files, so the rows
        returned here are not yet grouped by run.

        Args:
            max_workers: maximum number of parallel workers. A high number is
                generally safe given the lightweight nature of each thread's work:
                reading only a parquet file's footer metadata.
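
        Example (illustrative output shape, not real data):
            >>> ungrouped_df = run_manager._get_parquet_files_run_metadata()
            >>> ungrouped_df.columns.tolist()
            ['source', 'run_date', 'run_type', 'run_id', 'num_rows', 'filename']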
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for parquet_filepath in self.timdex_dataset.dataset.files:  # type: ignore[attr-defined]
                future = executor.submit(
                    self._parse_run_metadata_from_parquet_file,
                    parquet_filepath,
                )
                futures.append(future)

            # with ALL_COMPLETED, the second ("not done") set is always empty
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.ALL_COMPLETED
            )

        results = []
        for future in done:
            try:
                if result := future.result():
                    results.append(result)
            except Exception:
                logger.exception("Error reading run metadata from parquet file.")

        return pd.DataFrame(results) if results else pd.DataFrame()

    def _parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
        """Parse source, run_date, run_type, and run_id from a single parquet file.

        The TIMDEX parquet dataset has a characteristic we can exploit here: all rows
        in a given parquet file share the same values for the columns source,
        run_date, run_type, and run_id.

        Taking this a step further, we can extract these values without reading a
        single row: because every row shares one value per column, that value appears
        as the min/max in the parquet column statistics. This lets us recover run
        information by reading only the lightweight parquet file (footer) metadata.

        Args:
            parquet_filepath: path to the parquet file.
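
        Example (standalone sketch of the technique; "example.parquet" is
        hypothetical):
            >>> import pyarrow.parquet as pq
            >>> pf = pq.ParquetFile("example.parquet")
            >>> stats = pf.metadata.row_group(0).column(0).statistics
            >>> stats.min == stats.max  # true when all rows share one value
            True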
        """
        parquet_file = pq.ParquetFile(
            parquet_filepath,
            filesystem=self.timdex_dataset.filesystem,  # type: ignore[union-attr]
        )
        file_meta = parquet_file.metadata.to_dict()
        num_rows = file_meta["num_rows"]

        # column ordinal positions below follow the TIMDEX dataset schema; because
        # all rows share a single value per column, the "max" statistic is that value
        columns_meta = file_meta["row_groups"][0]["columns"]  # type: ignore[typeddict-item]
        source = columns_meta[3]["statistics"]["max"]
        run_date = columns_meta[4]["statistics"]["max"]
        run_type = columns_meta[5]["statistics"]["max"]
        run_id = columns_meta[7]["statistics"]["max"]

        return {
            "source": source,
            "run_date": run_date,
            "run_type": run_type,
            "run_id": run_id,
            "num_rows": num_rows,
            "filename": parquet_filepath,
        }
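

# A minimal usage sketch (hypothetical dataset location and source; TIMDEXDataset's
# exact constructor signature is assumed):
#
#   from timdex_dataset_api.dataset import TIMDEXDataset
#
#   td = TIMDEXDataset("s3://timdex/dataset")
#   run_manager = TIMDEXRunManager(td)
#   runs_df = run_manager.get_runs_metadata()
#   current_files = run_manager.get_current_source_parquet_files("alma")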