Commit 3c36ba0

Merge pull request #183 from structuredllm/bump

Bump transformers version to v4.51.0 and syncode version to v0.4.11

2 parents: a7b657f + d343567

20 files changed: 204 additions, 85 deletions

.github/workflows/run_tests.yml (1 addition, 1 deletion)

```diff
@@ -27,7 +27,7 @@ jobs:
         uses: actions/cache@v3
         with:
           path: /home/runner/work/syncode/syncode/cache/mask_stores/
-          key: files-${{ hashFiles('syncode/parsers/grammars/python_grammar.lark', 'syncode/dfa_mask_store.py') }}
+          key: files-${{ hashFiles('syncode/parsers/grammars/python.lark', 'syncode/dfa_mask_store.py') }}
       - name: Run Tests
         run: |
           python3 -m unittest tests.test_misc
```
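
The updated cache key points at `syncode/parsers/grammars/python.lark` instead of `python_grammar.lark`, so mask-store caches keyed on the old grammar path are invalidated. As a rough illustration of what the `hashFiles(...)` expression computes, here is a minimal Python sketch; the `cache_key` helper is hypothetical, not part of SynCode or GitHub Actions:

```python
import hashlib

def cache_key(paths):
    """Hypothetical rough analogue of the workflow's hashFiles() expression."""
    digest = hashlib.sha256()
    for path in sorted(paths):
        with open(path, "rb") as f:
            digest.update(f.read())  # any change to an input file changes the key
    return f"files-{digest.hexdigest()}"

# The mask-store cache is rebuilt only when the grammar or the DFA mask
# store implementation changes:
# cache_key(["syncode/parsers/grammars/python.lark", "syncode/dfa_mask_store.py"])
```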

README.md (1 addition, 1 deletion)

```diff
@@ -69,7 +69,7 @@ SynCode depends on HuggingFace [transformers](https://github.com/huggingface/tra
 
 | SynCode version | Required transformers version | Python version |
 | -------------- | ----------------------------- | -------------- |
-| `v0.4.10` (latest) | `v4.44.0` | 3.6 - 3.12 |
+| `v0.4.11` (latest) | `v4.51.0` | 3.6 - 3.12 |
 
 **Note:** Python 3.13 is not currently supported due to dependency constraints.
```
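
To verify that an installed environment matches the row above, one option (on Python 3.8+) is `importlib.metadata`; a minimal sketch:

```python
from importlib.metadata import version

print(version("syncode"))       # expected: 0.4.11
print(version("transformers"))  # expected: 4.51.0
```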

pyproject.toml (2 additions, 2 deletions)

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "syncode"
-version="0.4.10"
+version="0.4.11"
 requires-python = ">=3.6,<3.13"
 description = "Grammar-guided code generation tool"
 readme = "README.md"
@@ -24,7 +24,7 @@ dependencies = [
     "regex==2023.8.8",
     "torch",
     "tqdm",
-    "transformers==4.44.0",
+    "transformers==4.51.0",
     "datasets",
     "jsonschema",
 ]
```

requirements.txt (2 additions, 1 deletion)

```diff
@@ -1,8 +1,9 @@
+accelerate
 fire
 interegular
 regex==2023.8.8
 torch
 tqdm
-transformers==4.44.0; python_version < "3.13"
+transformers==4.51.0; python_version < "3.13"
 datasets
 jsonschema
```
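
The new `accelerate` requirement lines up with the `device_map="auto"` default introduced in syncode/common.py below, since transformers delegates "auto" device placement to accelerate. The transformers pin carries an environment marker so pip skips it on Python 3.13+; a minimal sketch of how such a marker evaluates, using the packaging library that pip relies on internally:

```python
from packaging.markers import Marker

# The marker string is copied from the requirement above.
marker = Marker('python_version < "3.13"')
print(marker.evaluate())  # True on Python 3.6-3.12, False on 3.13 and later
```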

setup.py (2 additions, 2 deletions)

```diff
@@ -11,14 +11,14 @@
     "regex==2023.8.8",
     "torch",
     "tqdm",
-    "transformers==4.44.0",
+    "transformers==4.51.0",
     "datasets",
     "jsonschema"
 ]
 
 setuptools.setup(
     name="syncode",
-    version="0.4.10",
+    version="0.4.11",
     author="Shubham Ugare",
     author_email="shubhamugare@gmail.com",
     description="This package provides the tool for grammar augmented LLM generation.",
```

syncode/common.py (26 additions, 10 deletions)

```diff
@@ -12,21 +12,32 @@
 
 
 def load_model(model_name, device, quantize, device_map = None):
+    torch_dtype = torch.bfloat16 if quantize else "auto"
+    device_map = device_map if device_map is not None else "auto"
+
+    attn_implementation = None
+    if "gemma-3" in model_name:
+        # This is due to the gemma-3 issue with SDPA implementation
+        # https://github.com/google-deepmind/gemma/issues/169
+        attn_implementation = "eager"
+        logging.info("Using slower \"eager\" attention implementation for gemma-3 due to issue with SDPA implementation")
+
     if model_name == 'test':
         model = AutoModelForCausalLM.from_pretrained('bigcode/tiny_starcoder_py').to(device)
     elif model_name == 'test-instruct':
         model = AutoModelForCausalLM.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
         if device_map is not None:
-            if (quantize):
-                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = device_map).eval()
-            else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True, device_map = device_map).eval()
-        else:
-            if (quantize):
-                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
-            else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True).eval().to(device)
+            logging.info(f"Loading model {model_name} with device:{device}, device_map:{device_map}, torch_dtype:{torch_dtype}")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            cache_dir=HF_CACHE,
+            token=HF_ACCESS_TOKEN,
+            trust_remote_code=True,
+            device_map = device_map,
+            attn_implementation=attn_implementation
+        ).eval()
     return model
 
 def load_tokenizer(model_name):
@@ -35,7 +46,12 @@ def load_tokenizer(model_name):
     elif model_name == 'test-instruct':
         tokenizer = AutoTokenizer.from_pretrained("rahuldshetty/tiny-starcoder-instruct")
     else:
-        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=HF_CACHE, token=HF_ACCESS_TOKEN, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            cache_dir=HF_CACHE,
+            token=HF_ACCESS_TOKEN,
+            trust_remote_code=True
+        )
     return tokenizer
 
 def get_output_path(model_name, grammar, dataset, num_samples, mode):
```
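
This refactor collapses four near-duplicate `from_pretrained` calls into one: `torch_dtype` and `device_map` are resolved up front, and `attn_implementation` stays `None` (letting transformers pick its default, normally SDPA) except for gemma-3, which is pinned to the slower eager implementation. A hedged usage sketch of the consolidated loader; the checkpoint names are illustrative and the import path assumes the installed syncode package:

```python
from syncode.common import load_model

# A gemma-3 checkpoint takes the "eager" attention path and, with
# quantize=True, loads in bfloat16 with device_map="auto".
model = load_model("google/gemma-3-1b-it", device="cuda", quantize=True)

# Any other checkpoint keeps the transformers default attention and
# torch_dtype="auto" (the dtype stored in the checkpoint).
model = load_model("bigcode/starcoder2-3b", device="cuda", quantize=False)
```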

syncode/evaluation/json_eval.py (4 additions, 1 deletion)

```diff
@@ -72,7 +72,10 @@ def run_eval_for_task(syncode, num_samples_per_task, problem, samples, pbar, tas
     else:
         problem["prompt"][0]['content'] = f"{problem['prompt'][0]['content']}\nOnly output JSON.\nJSON:\n"
 
-    prompt = syncode.model.tokenizer.apply_chat_template(problem["prompt"], tokenize = False)
+    if syncode.model.tokenizer.chat_template is not None:
+        prompt = syncode.model.tokenizer.apply_chat_template(problem["prompt"], tokenize = False)
+    else:
+        prompt = problem["prompt"][0]['content']
 
     batch_completions = syncode.model.generate_grammar_constrained_completion(prompt, num_samples_per_task)
     for completion_id, completion in enumerate(batch_completions):
```
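
In recent transformers versions, `apply_chat_template` raises for tokenizers that define no chat template, so this guard lets the JSON evaluation run on base (non-instruct) models by falling back to the raw prompt text. A minimal standalone sketch of the same pattern; the checkpoint is just an example of a tokenizer without a chat template:

```python
from transformers import AutoTokenizer

messages = [{"role": "user", "content": "Describe the user as JSON.\nOnly output JSON.\nJSON:\n"}]

tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py")
if tokenizer.chat_template is not None:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
else:
    prompt = messages[0]["content"]  # plain completion prompt for base models
```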
