Commit eab47a6

MOD: Modify 5GB size warning heuristics
1 parent 3d68b44 · commit eab47a6

8 files changed: 120 additions & 68 deletions


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 ## 0.13.0 - TBD
 - Added support for `statistics` schema
-- Upgraded `databento-dbn` to 0.5.1
+- Upgraded `databento-dbn` to 0.6.0
 - Renamed `booklevel` MBP field to `levels` for brevity and consistent naming
 - Changed `flags` field to an unsigned int
 - Changed default of `ts_out` to `False` for `Live` client

databento/common/dbnstore.py

Lines changed: 3 additions & 6 deletions
@@ -34,7 +34,7 @@
 from databento.common.symbology import InstrumentIdMappingInterval
 from databento.common.validation import validate_maybe_enum
 from databento.live.data import DBNStruct
-from databento_dbn import DbnDecoder, ErrorMsg, Metadata, SymbolMappingMsg, SystemMsg
+from databento_dbn import DBNDecoder, ErrorMsg, Metadata, SymbolMappingMsg, SystemMsg
 
 
 NON_SCHEMA_RECORD_TYPES = [
@@ -354,8 +354,7 @@ def __init__(self, data_source: DataSource) -> None:
 
     def __iter__(self) -> Generator[DBNStruct, None, None]:
         reader = self.reader
-        attach_ts_out = self.metadata.ts_out
-        decoder = DbnDecoder()
+        decoder = DBNDecoder()
         while True:
             raw = reader.read(DBNStore.DBN_READ_SIZE)
             if raw:
@@ -364,9 +363,7 @@ def __iter__(self) -> Generator[DBNStruct, None, None]:
                     records = decoder.decode()
                 except ValueError:
                     continue
-                for record, ts_out in records:
-                    if attach_ts_out and not isinstance(record, Metadata):
-                        setattr(record, "ts_out", ts_out)
+                for record in records:
                     yield record
             else:
                 if len(decoder.buffer()) > 0:
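
For context on the rename above: in `databento-dbn` 0.6.0 the decoder class is exported as `DBNDecoder`, and `decode()` yields records directly rather than `(record, ts_out)` tuples, which is why the loop body shrinks. A minimal sketch of the new iteration shape, assuming a `databento_dbn` 0.6.0 install; the `read_size` default here is illustrative, not the library's `DBN_READ_SIZE`:

from databento_dbn import DBNDecoder

def iter_dbn_records(reader, read_size=64 * 1024):
    # Feed raw bytes into the decoder and yield complete records;
    # decode() raises ValueError until a full record has been buffered.
    decoder = DBNDecoder()
    while True:
        raw = reader.read(read_size)
        if not raw:
            break
        decoder.write(raw)
        try:
            records = decoder.decode()
        except ValueError:
            continue
        for record in records:
            yield record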

databento/historical/api/timeseries.py

Lines changed: 103 additions & 50 deletions
@@ -18,9 +18,13 @@
 )
 from databento.common.validation import validate_enum, validate_semantic_string
 from databento.historical.api import API_VERSION
+from databento.historical.api.metadata import MetadataHttpAPI
 from databento.historical.http import BentoHttpAPI
 
 
+WARN_REQUEST_SIZE: int = 5 * 10**9  # 5 GB
+
+
 class TimeSeriesHttpAPI(BentoHttpAPI):
     """
     Provides request methods for the time series HTTP API endpoints.
@@ -127,10 +131,12 @@ def get_range(
         stype_in_valid = validate_enum(stype_in, SType, "stype_in")
         symbols_list = optional_symbols_list_to_string(symbols, stype_in_valid)
         schema_valid = validate_enum(schema, Schema, "schema")
+        start_valid = datetime_to_string(start)
+        end_valid = optional_datetime_to_string(end)
         params: List[Tuple[str, Optional[str]]] = [
             ("dataset", validate_semantic_string(dataset, "dataset")),
-            ("start", datetime_to_string(start)),
-            ("end", optional_datetime_to_string(end)),
+            ("start", start_valid),
+            ("end", end_valid),
             ("symbols", symbols_list),
             ("schema", str(schema_valid)),
             ("stype_in", str(stype_in_valid)),
@@ -144,10 +150,12 @@ def get_range(
             params.append(("limit", str(limit)))
 
         self._pre_check_data_size(
-            symbols=symbols,
+            dataset=dataset,
+            stype_in=stype_in_valid,
+            symbols=symbols_list,
             schema=schema_valid,
-            start=start,
-            end=end,
+            start=start_valid,
+            end=end_valid,
             limit=limit,
         )
 
@@ -267,10 +275,12 @@ async def get_range_async(
         stype_in_valid = validate_enum(stype_in, SType, "stype_in")
         symbols_list = optional_symbols_list_to_string(symbols, stype_in_valid)
         schema_valid = validate_enum(schema, Schema, "schema")
+        start_valid = datetime_to_string(start)
+        end_valid = optional_datetime_to_string(end)
         params: List[Tuple[str, Optional[str]]] = [
             ("dataset", validate_semantic_string(dataset, "dataset")),
-            ("start", datetime_to_string(start)),
-            ("end", optional_datetime_to_string(end)),
+            ("start", start_valid),
+            ("end", end_valid),
             ("symbols", symbols_list),
             ("schema", str(schema_valid)),
             ("stype_in", str(stype_in_valid)),
@@ -283,10 +293,12 @@ async def get_range_async(
             params.append(("limit", str(limit)))
 
         self._pre_check_data_size(
-            symbols=symbols,
+            dataset=dataset,
+            stype_in=stype_in_valid,
+            symbols=symbols_list,
             schema=schema_valid,
-            start=start,
-            end=end,
+            start=start_valid,
+            end=end_valid,
             limit=limit,
         )
 
@@ -308,59 +320,100 @@ async def get_range_async(
         writer.seek(0)  # rewind for read
         return DBNStore.from_bytes(writer.read())
 
-    def _pre_check_data_size(  # noqa (prefer not to make static)
+    def _pre_check_data_size(
         self,
-        symbols: Optional[Union[List[str], str]],
+        dataset: str,
+        symbols: str,
         schema: Schema,
-        start: Optional[Union[pd.Timestamp, date, str, int]],
-        end: Optional[Union[pd.Timestamp, date, str, int]],
+        start: str,
+        end: Optional[str],
+        stype_in: SType,
         limit: Optional[int],
     ) -> None:
-        if limit and limit < 10**7:
+        if _is_size_limited(
+            schema=schema,
+            limit=limit,
+        ):
             return
 
-        # Use heuristics to check ballpark data size
-        if (
-            _is_large_data_size_schema(schema)
-            or _is_greater_than_one_day(start, end)
-            or _is_large_number_of_symbols(symbols)
+        if _is_period_limited(
+            schema=schema,
+            symbols=symbols,
+            start=start,
+            end=end,
         ):
-            warnings.warn(
-                message="The size of this streaming request is estimated "
-                "to be 5 GB or greater.\nWe recommend breaking your request "
-                "into smaller requests, or submitting a batch download request.\n"
-                "This warning can be suppressed: "
-                "https://docs.python.org/3/library/warnings.html",
-                category=BentoWarning,
-                stacklevel=3,  # This makes the error happen in user code
-            )
-
+            return
 
-def _is_large_number_of_symbols(symbols: Optional[Union[List[str], str]]) -> bool:
-    if not symbols:
-        return True  # Full universe
+        metadata_api = MetadataHttpAPI(
+            key=self._key,
+            gateway=self._gateway,
+        )
+        request_size = metadata_api.get_billable_size(
+            dataset=dataset,
+            start=start,
+            end=end,
+            symbols=symbols,
+            schema=schema,
+            stype_in=stype_in,
+            limit=limit,
+        )
 
-    if isinstance(symbols, str):
-        symbols = symbols.split(",")
+        if request_size < WARN_REQUEST_SIZE:
+            return
 
-    if len(symbols) >= 500:
-        return True
+        warnings.warn(
+            message="""The size of this streaming request is greater than 5GB.
+            It is recommended to submit a batch download request for large volumes
+            of data, or break this request into smaller requests.
+            This warning can be suppressed:
+            https://docs.python.org/3/library/warnings.html""",
+            category=BentoWarning,
+            stacklevel=3,  # This makes the error happen in user code
+        )
 
-    return False
 
+def _is_size_limited(
+    schema: Schema,
+    limit: Optional[int],
+    max_size: int = WARN_REQUEST_SIZE,
+) -> bool:
+    if limit is None:
+        return False
 
-def _is_large_data_size_schema(schema: Schema) -> bool:
-    return schema in (Schema.MBO, Schema.MBP_10)
+    estimated_size = limit * schema.get_record_type().size_hint()
+    return estimated_size < max_size
 
 
-def _is_greater_than_one_day(
-    start: Optional[Union[pd.Timestamp, date, str, int]],
-    end: Optional[Union[pd.Timestamp, date, str, int]],
+def _is_period_limited(
+    schema: Schema,
+    symbols: str,
+    start: str,
+    end: Optional[str],
+    max_size: int = WARN_REQUEST_SIZE,
 ) -> bool:
-    if start is None or end is None:
-        return True
-
-    if pd.to_datetime(end) - pd.to_datetime(start) > pd.Timedelta(days=1):
-        return True
-
-    return False
+    if end is None:
+        return False
+
+    if schema not in (
+        Schema.OHLCV_1S,
+        Schema.OHLCV_1M,
+        Schema.OHLCV_1H,
+        Schema.OHLCV_1D,
+        Schema.DEFINITION,
+    ):
+        return False
+
+    dt_start = pd.to_datetime(start, utc=True)
+    dt_end = pd.to_datetime(end, utc=True)
+
+    # default scale to one day for ohlcv_1d and definition
+    scale = {
+        Schema.OHLCV_1S: 1,
+        Schema.OHLCV_1M: 60,
+        Schema.OHLCV_1H: 60 * 60,
+    }.get(schema, 60 * 60 * 24)
+
+    num_symbols = len(symbols.split(","))
+    num_records = num_symbols * (dt_end - dt_start).total_seconds() // scale
+    estimated_size = num_records * schema.get_record_type().size_hint()
+    return estimated_size < max_size
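
The net effect of the new heuristics: a request skips the warning when an explicit `limit` caps its size (`_is_size_limited`), or when the schema emits records at a fixed cadence and the requested window is provably small (`_is_period_limited`); only then does `_pre_check_data_size` spend a round trip on `get_billable_size` and warn above `WARN_REQUEST_SIZE`. A worked sketch of the period-based estimate, with the per-record size treated as an assumption for illustration (the real value comes from `schema.get_record_type().size_hint()`):

RECORD_SIZE = 56  # assumed bytes per OHLCV record, illustrative only
WARN_REQUEST_SIZE = 5 * 10**9  # 5 GB, matching the constant above

def ohlcv_1s_estimate(num_symbols: int, window_seconds: int) -> int:
    # ohlcv-1s emits at most one record per symbol per second,
    # so the window bounds the record count (scale = 1 above).
    return num_symbols * window_seconds * RECORD_SIZE

# 10 symbols over one day stays comfortably under the threshold ...
print(ohlcv_1s_estimate(10, 86_400))        # 48_384_000 (~48 MB) -> no warning
# ... while 100 symbols over 30 days does not:
print(ohlcv_1s_estimate(100, 30 * 86_400))  # 14_515_200_000 (~14.5 GB)

As the warning text notes, callers who expect large streams can silence it with the standard `warnings` machinery, e.g. `warnings.filterwarnings("ignore", category=BentoWarning)`.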

databento/live/data.py

Lines changed: 2 additions & 2 deletions
@@ -73,10 +73,10 @@ async def _stream_task(
         stream: IO[bytes],
         record: DBNStruct,
     ) -> None:
-        ts_out = getattr(record, "ts_out")
+        ts_out = getattr(record, "ts_out", None)
         try:
            stream.write(bytes(record))
-            if not isinstance(record, databento_dbn.Metadata) and ts_out:
+            if not isinstance(record, databento_dbn.Metadata) and ts_out is not None:
                 stream.write(struct.pack("Q", ts_out))
         except Exception as exc:
             stream_name = getattr(stream, "name", str(stream))
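
The two-character change above is load-bearing: with `databento-dbn` 0.6.0 no longer attaching `ts_out` to every decoded record, the one-argument `getattr(record, "ts_out")` would raise `AttributeError`, while the three-argument form yields `None`; and the `ts_out is not None` guard, unlike plain truthiness, still writes a legitimate `ts_out` of `0`. A stand-in sketch of the framing, with a hypothetical record type in place of the real `databento_dbn` records:

import struct

class FakeRecord:
    """Stand-in for a decoded record that carries no ts_out attribute."""

record = FakeRecord()

# getattr(record, "ts_out") would raise AttributeError here;
# the default turns the missing attribute into an explicit None.
ts_out = getattr(record, "ts_out", None)

if ts_out is not None:
    # "Q" packs an unsigned 64-bit integer in native byte order,
    # appended after the record bytes as in _stream_task above.
    trailer = struct.pack("Q", ts_out)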

databento/live/dbn.py

Lines changed: 3 additions & 7 deletions
@@ -57,7 +57,7 @@ def __init__(
         self._buffer: bytearray = bytearray()
         self._client_callback = client_callback
         self._dbn_queue: Optional[DBNQueue] = None
-        self._decoder: databento_dbn.DbnDecoder = databento_dbn.DbnDecoder()
+        self._decoder: databento_dbn.DBNDecoder = databento_dbn.DBNDecoder()
         self._disconnected: "asyncio.Future[None]" = loop.create_future()
         self._transport_lock = threading.Lock()
         self._transport = transport
@@ -184,19 +184,15 @@ def buffer_updated(self, nbytes: int) -> None:
         except ValueError:
             pass
         else:
-            for record, ts_out in records:
+            for record in records:
                 header = getattr(record, "hd", object())
                 ts_event = getattr(header, "ts_event", None)
                 logger.info(
-                    "decoded as %s record ts_event=%s ts_out=%s",
+                    "decoded as %s record ts_event=%s",
                     type(record).__name__,
                     ts_event,
-                    ts_out,
                 )
 
-                if not isinstance(record, databento_dbn.Metadata):
-                    setattr(record, "ts_out", ts_out)
-
                 # Record Dispatch
                 self._client_callback(record)
 

databento/version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.12.0"
+__version__ = "0.13.0"

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 aiohttp>=3.7.2,<4.0.0
-databento-dbn==0.5.1
+databento-dbn==0.6.0
 numpy>=1.17.0
 pandas>=1.1.3
 requests>=2.24.0

tests/test_historical_timeseries.py

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,7 @@
 import requests
 from databento import DBNStore
 from databento.common.enums import Schema
+from databento.historical.api.timeseries import TimeSeriesHttpAPI
 from pytest_mock import MockerFixture
 
 
@@ -62,6 +63,11 @@ def test_get_range_sends_expected_request(
 
     # Mock from_bytes with the definition stub
     stream_bytes = test_data(Schema.TRADES)
+    monkeypatch.setattr(
+        TimeSeriesHttpAPI,
+        "_pre_check_data_size",
+        MagicMock(return_value=True),
+    )
     monkeypatch.setattr(
         DBNStore,
         "from_bytes",
