Skip to content

Commit c9d23be

Browse files
authored
Merge pull request #3 from saswati2/califorest-saswati
Califorest saswati
2 parents e132eb4 + 1d0eee5 commit c9d23be

File tree

6 files changed

+288
-0
lines changed

6 files changed

+288
-0
lines changed

califorest_tests/__init__.py

Whitespace-only changes.

califorest_tests/test_datasets.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import json
2+
from califorest_tests.utils import create_temp_dataset
3+
4+
"""
5+
Dataset tests using small synthetic EHR data.
6+
7+
These tests verify:
8+
- JSON dataset loading
9+
- Patient and visit structure
10+
- Event field integrity
11+
12+
All tests use temporary directories and synthetic data to ensure
13+
fast execution and full isolation.
14+
"""
15+
16+
def test_dataset_loading():
    """Verify that the synthetic dataset JSON can be loaded correctly."""
    temp_dir, data_path = create_temp_dataset()
    try:
        with open(data_path) as f:
            data = json.load(f)

        # Basic integrity: the dataset is a non-empty list of patients.
        assert isinstance(data, list)
        assert len(data) > 0
    finally:
        # Clean up even when an assertion fails, so no temp dir leaks.
        temp_dir.cleanup()
28+
29+
30+
def test_patient_structure():
    """Verify each patient record contains the required fields."""
    temp_dir, data_path = create_temp_dataset()
    try:
        with open(data_path) as f:
            patients = json.load(f)

        patient = patients[0]
        assert "patient_id" in patient
        assert "visits" in patient
        # The first synthetic patient is generated with exactly two visits.
        assert len(patient["visits"]) == 2
    finally:
        # Clean up even when an assertion fails, so no temp dir leaks.
        temp_dir.cleanup()
44+
45+
46+
def test_visit_structure():
    """Verify each visit contains the required event fields."""
    temp_dir, data_path = create_temp_dataset()
    try:
        with open(data_path) as f:
            patients = json.load(f)

        visit = patients[0]["visits"][0]

        assert "conditions" in visit
        assert "procedures" in visit
        assert "drugs" in visit
        assert "label" in visit
    finally:
        # Clean up even when an assertion fails, so no temp dir leaks.
        temp_dir.cleanup()

califorest_tests/test_models.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import torch
2+
import torch.nn as nn
3+
from califorest_tests.utils import create_synthetic_ehr
4+
5+
"""
6+
Model unit tests using tiny synthetic tensors.
7+
8+
These tests verify:
9+
- Model instantiation
10+
- Forward pass correctness
11+
- Output shape validation
12+
- Gradient computation during backpropagation
13+
"""
14+
15+
class TinyModel(nn.Module):
    """Minimal single-layer model used as a stand-in for real architectures."""

    def __init__(self, in_features=8):
        super().__init__()
        # One linear layer mapping the input features to a single logit.
        self.fc = nn.Linear(in_features, 1)

    def forward(self, x):
        """Return one logit per input row."""
        logits = self.fc(x)
        return logits
22+
23+
24+
def test_model_instantiation():
    """The tiny model can be constructed without errors."""
    assert TinyModel() is not None
28+
29+
30+
def test_forward_pass():
    """Forward pass returns one output row per input sample."""
    features, _ = create_synthetic_ehr()
    net = TinyModel()

    inputs = torch.tensor(features, dtype=torch.float32)
    predictions = net(inputs)

    # Batch dimension must be preserved end to end.
    assert predictions.shape[0] == features.shape[0]
39+
40+
41+
def test_backward_pass():
    """Backward pass populates gradients on the model parameters."""
    features, labels = create_synthetic_ehr()
    net = TinyModel()

    inputs = torch.tensor(features, dtype=torch.float32)
    targets = torch.tensor(labels, dtype=torch.float32).view(-1, 1)

    criterion = nn.BCEWithLogitsLoss()
    loss = criterion(net(inputs), targets)
    loss.backward()

    # After backward(), the linear layer's weight gradient must exist.
    assert net.fc.weight.grad is not None

califorest_tests/test_tasks.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import numpy as np
2+
from califorest_tests.utils import create_synthetic_patient_records
3+
4+
"""
5+
Task pipeline tests using synthetic patient records.
6+
7+
These tests validate:
8+
- Sample processing
9+
- Feature extraction
10+
- Label generation
11+
- Edge case handling
12+
"""
13+
14+
def process_samples(patients):
    """Fake task pipeline: flatten visits into feature counts and labels."""
    pairs = [
        (len(visit["conditions"]) + len(visit["drugs"]), visit["label"])
        for patient in patients
        for visit in patient["visits"]
    ]
    features = np.array([count for count, _ in pairs])
    labels = np.array([label for _, label in pairs])
    return features, labels
26+
27+
28+
def test_sample_processing():
    """Features and labels stay aligned, and features form a flat vector."""
    records = create_synthetic_patient_records()
    features, labels = process_samples(records)

    assert len(features) == len(labels)
    assert features.ndim == 1
34+
35+
36+
def test_label_generation():
    """Labels produced by the pipeline are strictly binary."""
    records = create_synthetic_patient_records()
    _, labels = process_samples(records)

    assert set(labels).issubset({0, 1})
41+
42+
43+
def test_edge_cases_empty_patient():
    """An empty patient list yields an empty feature vector."""
    features, _labels = process_samples([])
    assert len(features) == 0

califorest_tests/utils.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
"""
2+
Utility functions for generating small synthetic data used in tests.
3+
4+
These helpers ensure tests run quickly, use no real datasets, and
5+
remain fully isolated from external dependencies.
6+
"""
7+
8+
import json
9+
import numpy as np
10+
import tempfile
11+
from pathlib import Path
12+
13+
14+
# ---------------------------------------------------
15+
# Synthetic DATASET generator (for dataset tests)
16+
# ---------------------------------------------------
17+
18+
def create_temp_dataset():
    """
    Write 2 synthetic patients to a JSON file in a fresh temp directory.

    Returns
    -------
    temp_dir : TemporaryDirectory
        Temporary directory object (must be cleaned up by tests).
    data_path : Path
        Path to the generated JSON file.
    """
    temp_dir = tempfile.TemporaryDirectory()
    data_path = Path(temp_dir.name) / "patients.json"

    # Serialize the synthetic records directly to the target file.
    data_path.write_text(json.dumps(create_synthetic_patient_records()))

    return temp_dir, data_path
38+
39+
40+
# ---------------------------------------------------
41+
# Synthetic PATIENT records (for dataset + task tests)
42+
# ---------------------------------------------------
43+
44+
def create_synthetic_patient_records():
    """
    Generate tiny synthetic EHR patient records.

    Returns
    -------
    list
        List of patient dictionaries with visit information.
    """

    def visit(conditions, procedures, drugs, label):
        # Assemble one visit record with the fields the tests rely on.
        return {
            "conditions": conditions,
            "procedures": procedures,
            "drugs": drugs,
            "label": label,
        }

    return [
        {
            "patient_id": "p1",
            "visits": [
                visit(["c1", "c2"], ["p1"], ["d1"], 1),
                visit(["c3"], ["p2"], ["d2", "d3"], 0),
            ],
        },
        {
            "patient_id": "p2",
            "visits": [
                visit(["c4"], [], ["d4"], 0),
            ],
        },
    ]
83+
84+
85+
# ---------------------------------------------------
86+
# Synthetic MODEL tensors (for model tests)
87+
# ---------------------------------------------------
88+
89+
def create_synthetic_ehr(n_samples: int = 4, n_features: int = 8, seed=None):
    """
    Generate tiny synthetic tensors for model testing.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate.
    n_features : int
        Number of input features.
    seed : int or None, optional
        Seed for the random generator. Pass an int for reproducible
        output; the default of None keeps the previous non-deterministic
        behavior, so existing callers are unaffected.

    Returns
    -------
    X : np.ndarray
        Feature matrix of shape (n_samples, n_features), float32 in [0, 1).
    y : np.ndarray
        Binary labels of shape (n_samples,), float32 values in {0, 1}.
    """
    # Use a local Generator instead of the global numpy random state so
    # seeding here cannot perturb other tests' randomness.
    rng = np.random.default_rng(seed)
    X = rng.random((n_samples, n_features)).astype(np.float32)
    y = rng.integers(0, 2, size=n_samples).astype(np.float32)
    return X, y

tests/utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import numpy as np
2+
3+
def create_synthetic_ehr(num_patients=10, num_features=8):
    """
    Create fake patient EHR data for testing.

    Parameters
    ----------
    num_patients : int
        Number of patient rows to generate.
    num_features : int
        Number of feature columns per patient.

    Returns
    -------
    X : np.ndarray
        Feature matrix of shape (num_patients, num_features), floats in [0, 1).
    y : np.ndarray
        Binary integer labels of shape (num_patients,).
    """
    # NOTE: the original placed a stray debug print() before the docstring,
    # which made the string literal a no-op statement rather than a docstring;
    # both issues are fixed here.
    # Fixed seed keeps the generated data deterministic across calls.
    np.random.seed(0)
    X = np.random.rand(num_patients, num_features)
    y = np.random.randint(0, 2, size=num_patients)

    return X, y
14+
15+
16+
if __name__ == "__main__":
    # Quick manual smoke check when run as a script.
    features, labels = create_synthetic_ehr()
    print("X shape:", features.shape)
    print("y shape:", labels.shape)

0 commit comments

Comments
 (0)