- from ast import Tuple
+ from ast import Dict, Tuple
  import time
  import torch
  import syncode.common as common
  from syncode.grammar_mask.logits_processor import SyncodeLogitsProcessor
  from transformers import LogitsProcessorList, StoppingCriteriaList, StoppingCriteria, PreTrainedModel
  from syncode.parsers.grammars import Grammar
- from syncode.utils.generation import filter_code, fix_indents
- from typing import Callable, Iterable, Union
+ from typing import Any, Callable, Iterable, Union
  from transformers.generation.utils import GenerationMode
  from transformers.generation.configuration_utils import GenerationConfig
-
+ from transformers.generation.logits_process import (
+     TemperatureLogitsWarper,
+     TopKLogitsWarper,
+     TopPLogitsWarper,
+ )
+ from transformers.cache_utils import Cache

  class KeywordsStoppingCriteria(StoppingCriteria):
      '''
@@ -189,34 +193,50 @@ def _generate(
          """
          We support greedy search and sampling for batch size 1 otherwise we use the generate function from transformers library.
          """
-         token_ids, attention_mask, past_key_values = inputs['input_ids'], inputs['attention_mask'], None
-
+
+         # Get the input ids and attention mask
+         token_ids = inputs['input_ids']
+         model_kwargs = {}
+         model_kwargs['attention_mask'] = inputs['attention_mask']
+         model_kwargs['use_cache'] = True
+         model_kwargs = self._get_initial_cache_position(token_ids, model_kwargs)
+
          # This does not include grammar decoder
-         self.model._prepare_special_tokens(gen_config, False, device=self.device)
+         self.model._prepare_special_tokens(gen_config, True, device=self.device)

          # Add logits processor for generation parameters such as top_k, top_p, temperature, etc.
-         logits_processor = self.model._get_logits_warper(gen_config, self.device)
+         logits_processor = self._get_logits_processors(gen_config)

          max_tokens = self.gen_args['max_new_tokens']+token_ids.size(1)
-         self.model.config.pad_token_id = pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+         self.model.config.pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+
+         # Prepare the cache. (This is copied from the transformers generation_utils.py)
+         # - `model_kwargs` may be updated in place with a cache as defined by the parameters in `gen_config`.
+         # - different models have a different cache name expected by the model (default = "past_key_values")
+         # - `max_length`, prepared above, is used to determine the maximum cache length
+         max_cache_length = max_tokens - 1
+         self.model._prepare_cache_for_generation(
+             gen_config,
+             model_kwargs,
+             assistant_model=None,
+             batch_size=token_ids.shape[0],
+             max_cache_length=max_cache_length,
+             device=self.device
+         )

          while True:
+             model_inputs = self.model.prepare_inputs_for_generation(token_ids, **model_kwargs)
              try:
-                 if past_key_values: # Get the last token if kv is cached for all previous tokens
-                     input_ids = token_ids[..., -1].unsqueeze(-1)
-                 else:
-                     input_ids = token_ids
-
-                 outputs = self.model(
-                     input_ids,
-                     attention_mask=attention_mask,
-                     past_key_values=past_key_values
-                 )
+                 outputs = self.model(**model_inputs, return_dict=True)
              except IndexError as e:
                  raise ValueError(f"The input length exceeds the context length of the model. {e}")

-             next_token_scores, past_key_values = outputs.logits[:, -1, :], outputs.past_key_values
+             model_kwargs = self._update_model_kwargs_for_generation(outputs, model_kwargs)

+             # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
+             # (the clone itself is always small)
+             next_token_scores = outputs.logits[:, -1, :].to(copy=True, dtype=torch.float32, device=token_ids.device)
+
              if grammar_decoder is not None:
                  next_token = self._get_next_token(gen_mode, token_ids, logits_processor, next_token_scores)
                  is_valid = grammar_decoder.is_valid(token_ids, next_token)
@@ -240,12 +260,6 @@ def _generate(
              if finish_generation or next_token == self.tokenizer.eos_token_id or token_ids.size(1) >= max_tokens:
                  break

-             # Update attention mask
-             attention_mask = torch.cat([attention_mask, torch.ones((attention_mask.size(0), 1), dtype=attention_mask.dtype).to(self.device)], dim=-1)
-
-             if debug:
-                 grammar_decoder.print_debug()
-
          return token_ids

      def _get_next_token(self, gen_mode, token_ids, logits_processor, next_token_scores):
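Note on the hunks above: the call to `self.model._get_logits_warper` is replaced by the new `_get_logits_processors` helper, which stacks the standard temperature/top-k/top-p warpers. The following standalone sketch is not part of this commit; the temperature, top-k, top-p values, the vocabulary size of 32000, and the dummy prompt ids are illustrative. It only shows how such a `LogitsProcessorList` filters next-token scores before sampling, assuming a recent transformers release.

import torch
from transformers import LogitsProcessorList
from transformers.generation.logits_process import (
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
)

# The same kind of warper stack _get_logits_processors assembles when do_sample=True.
warpers = LogitsProcessorList([
    TemperatureLogitsWarper(0.8),
    TopKLogitsWarper(top_k=50, min_tokens_to_keep=1),
    TopPLogitsWarper(top_p=0.95, min_tokens_to_keep=1),
])

input_ids = torch.tensor([[101, 2023, 2003]])   # dummy prompt ids, batch size 1
scores = torch.randn(1, 32000)                  # dummy next-token logits
warped = warpers(input_ids, scores)             # filtered vocabulary entries become -inf
probs = torch.softmax(warped, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)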
@@ -258,34 +272,140 @@ def _get_next_token(self, gen_mode, token_ids, logits_processor, next_token_scor
          return next_token

      def _get_generation_mode(
-         self, generation_config: GenerationConfig
+         self, gen_config: GenerationConfig
      ) -> GenerationMode:
          """
          Returns the generation mode triggered by a [`GenerationConfig`] instance.
          """
-         if generation_config.constraints is not None or generation_config.force_words_ids is not None:
+         if gen_config.constraints is not None or gen_config.force_words_ids is not None:
              generation_mode = GenerationMode.CONSTRAINED_BEAM_SEARCH
-         elif generation_config.num_beams == 1:
-             if generation_config.do_sample is False:
+         elif gen_config.num_beams == 1:
+             if gen_config.do_sample is False:
                  if (
-                     generation_config.top_k is not None
-                     and generation_config.top_k > 1
-                     and generation_config.penalty_alpha is not None
-                     and generation_config.penalty_alpha > 0
+                     gen_config.top_k is not None
+                     and gen_config.top_k > 1
+                     and gen_config.penalty_alpha is not None
+                     and gen_config.penalty_alpha > 0
                  ):
                      generation_mode = GenerationMode.CONTRASTIVE_SEARCH
                  else:
                      generation_mode = GenerationMode.GREEDY_SEARCH
              else:
                  generation_mode = GenerationMode.SAMPLE
          else:
-             if generation_config.num_beam_groups > 1:
+             if gen_config.num_beam_groups > 1:
                  generation_mode = GenerationMode.GROUP_BEAM_SEARCH
-             elif generation_config.do_sample is True:
+             elif gen_config.do_sample is True:
                  generation_mode = GenerationMode.BEAM_SAMPLE
              else:
                  generation_mode = GenerationMode.BEAM_SEARCH
          return generation_mode

      def tokenize(self, s: str) -> 'Iterable[int]':
          return self.tokenizer.encode(s, add_special_tokens=False)
+
+     def _get_logits_processors(self, gen_config: GenerationConfig) -> LogitsProcessorList:
+         """
+         Returns a [`~transformers.generation.LogitsProcessorList`] with the appropriate [`LogitsProcessor`]s to use for
+         generation.
+         """
+         processors = LogitsProcessorList()
+         if gen_config.do_sample:
+             # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
+             # better score (i.e. keep len(list(gen_config._eos_token_tensor)) + 1)
+             if gen_config.num_beams > 1:
+                 if isinstance(gen_config._eos_token_tensor, list):
+                     min_tokens_to_keep = len(gen_config._eos_token_tensor) + 1
+                 elif isinstance(gen_config._eos_token_tensor, torch.Tensor):
+                     min_tokens_to_keep = gen_config._eos_token_tensor.shape[0] + 1
+                 else:
+                     min_tokens_to_keep = 2
+             else:
+                 min_tokens_to_keep = 1
+
+             # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
+             # all samplers can be found in `generation_utils_samplers.py`
+             if gen_config.temperature is not None and gen_config.temperature != 1.0:
+                 processors.append(TemperatureLogitsWarper(gen_config.temperature))
+             if gen_config.top_k is not None and gen_config.top_k != 0:
+                 processors.append(
+                     TopKLogitsWarper(top_k=gen_config.top_k, min_tokens_to_keep=min_tokens_to_keep)
+                 )
+             if gen_config.top_p is not None and gen_config.top_p < 1.0:
+                 processors.append(
+                     TopPLogitsWarper(top_p=gen_config.top_p, min_tokens_to_keep=min_tokens_to_keep)
+                 )
+         return processors
+
+     def _update_model_kwargs_for_generation(
+         self,
+         outputs,
+         model_kwargs: dict[str, Any],
+     ) -> dict[str, Any]:
+         # Variable names used to hold the cache at generation time
+         ALL_CACHE_NAMES = [
+             "past_key_values",  # default
+             "cache_params",  # mamba-based models
+             "state",  # rwkv
+             "mems",  # xlnet
+             "past_buckets_states",  # reformer
+         ]
+
+         # update past_key_values keeping its naming used in model code
+         for possible_cache_name in ALL_CACHE_NAMES:
+             if possible_cache_name in outputs:
+                 if possible_cache_name in ("past_buckets_states", "mems"):
+                     cache_name = "past_key_values"
+                 else:
+                     cache_name = possible_cache_name
+                 model_kwargs[cache_name] = getattr(outputs, possible_cache_name)
+                 break
+
+         # update token_type_ids with last value
+         if "token_type_ids" in model_kwargs:
+             token_type_ids = model_kwargs["token_type_ids"]
+             model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+         # assuming is_encoder_decoder = False
+         if "attention_mask" in model_kwargs:
+             attention_mask = model_kwargs["attention_mask"]
+             model_kwargs["attention_mask"] = torch.cat(
+                 [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+             )
+
+         if model_kwargs.get("use_cache", True):
+             model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1  # num_new_tokens = 1
+         else:
+             past_positions = model_kwargs.pop("cache_position")
+             new_positions = torch.arange(
+                 past_positions[-1] + 1, past_positions[-1] + 2, dtype=past_positions.dtype
+             ).to(past_positions.device)
+             model_kwargs["cache_position"] = torch.cat((past_positions, new_positions))
+
+         return model_kwargs
+
+     def _get_initial_cache_position(self, input_ids, model_kwargs):
+         """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
+         # `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange`
+         if "inputs_embeds" in model_kwargs and not self.config.is_encoder_decoder:
+             cache_position = torch.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
+         elif "decoder_inputs_embeds" in model_kwargs and self.config.is_encoder_decoder:
+             cache_position = (
+                 torch.ones_like(model_kwargs["decoder_inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
+             )
+         else:
+             cache_position = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1
+
+         past_length = 0
+         if model_kwargs.get("past_key_values") is not None:
+             cache = model_kwargs["past_key_values"]
+             past_length = 0
+             if not isinstance(cache, Cache):
+                 past_length = cache[0][0].shape[2]
+             elif hasattr(cache, "get_seq_length") and cache.get_seq_length() is not None:
+                 past_length = cache.get_seq_length()
+
+             cache_position = cache_position[past_length:]
+
+         model_kwargs["cache_position"] = cache_position
+         return model_kwargs
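
A minimal sketch of the per-step bookkeeping that `_get_initial_cache_position` and `_update_model_kwargs_for_generation` perform when `use_cache=True`: the pre-fill `cache_position` covers the prompt, and each decoding step extends the attention mask by one column and advances `cache_position` by one. The toy values (batch size 1, a 5-token prompt, three steps) are illustrative and not part of this commit, and `torch.arange` stands in for the `torch.compile`-friendly cumsum construction used above.

import torch

# Pre-fill: positions 0..4 for a 5-token prompt (what _get_initial_cache_position computes).
model_kwargs = {
    "attention_mask": torch.ones(1, 5, dtype=torch.long),
    "cache_position": torch.arange(5),
    "use_cache": True,
}

# Three decoding steps of the use_cache=True branch in _update_model_kwargs_for_generation.
for _ in range(3):
    attention_mask = model_kwargs["attention_mask"]
    model_kwargs["attention_mask"] = torch.cat(
        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
    )
    model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1

print(model_kwargs["attention_mask"].shape)  # torch.Size([1, 8])
print(model_kwargs["cache_position"])        # tensor([7])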