171 commits
8c5497b
Implement TimedGRPOTrainer to log roll-out batch durations
mdvillagra Jul 22, 2025
e91d56a
Add venv to .gitignore to exclude virtual environment files
mdvillagra Jul 22, 2025
b990ba2
Add timing logs for set_model_params in FullModelLLMTrainer and FullM…
mdvillagra Jul 22, 2025
26f9668
Update run_fedml_client_custom.sh and run_fedml_server_custom.sh to i…
mdvillagra Jul 22, 2025
74a395f
Add periodic checkpointing and per-round checkpoint configuration to …
mdvillagra Jul 22, 2025
6c3ea9c
Remove commented-out code blocks in FullModelLLMTrainer for clarity
mdvillagra Jul 22, 2025
499104a
Add logging for global update frequency in FedMLServerManager
mdvillagra Jul 23, 2025
a2e576e
Add Nesterov momentum support in FullModelLLMAggregator and update lo…
mdvillagra Jul 23, 2025
579dd31
Refactor checkpoint saving logic in FullModelLLMTrainer and FullModel…
mdvillagra Jul 23, 2025
5d885ae
Merge remote changes from origin/marcos/benchmarks
mdvillagra Jul 23, 2025
b2c987e
Add RewardFunction class for evaluating model responses with correctn…
mdvillagra Jul 23, 2025
37ad46e
Update logging format in FedMLServerManager to include total rounds a…
mdvillagra Jul 23, 2025
b5fb686
Refactor correctness_reward and combined_reward methods in RewardFunc…
mdvillagra Jul 23, 2025
91642a4
Remove the format_reward method from the RewardFunction class to stre…
mdvillagra Jul 23, 2025
eaa7344
Increase max_new_tokens in FullModelLLMTrainer from 512 to 1024 for i…
mdvillagra Jul 23, 2025
afc697c
Add report_to parameter for Weights & Biases integration in FullModel…
mdvillagra Jul 23, 2025
99434ed
Refactor reward function usage in FullModelLLMTrainer to utilize rewa…
mdvillagra Jul 24, 2025
ee38a16
Refactor reward function parameters in FullModelLLMTrainer to enhance…
mdvillagra Jul 24, 2025
f4b4f1d
Update gradient accumulation steps and batch sizes in grpo_gsm8k_test…
mdvillagra Jul 24, 2025
740eed7
Update grpo_batch_size in grpo_gsm8k_test_config.yaml from 32 to 2 fo…
mdvillagra Jul 24, 2025
cd16d02
Update FullModelLLMTrainer to correctly handle model state dict loadi…
mdvillagra Jul 24, 2025
f8261bb
Refactor state dict loading in FullModelLLMTrainer to correctly acces…
mdvillagra Jul 24, 2025
c7848a3
Enhance checkpoint saving in FullModelLLMAggregator to utilize Huggin…
mdvillagra Jul 24, 2025
8732a29
Update communication rounds in grpo_gsm8k_test_config.yaml from 3 to …
mdvillagra Jul 24, 2025
85220ca
Add debugging breakpoint in reward function of FullModelLLMTrainer fo…
mdvillagra Jul 24, 2025
dd8a896
Update client configuration in grpo_gsm8k_test_config.yaml to use a s…
mdvillagra Jul 24, 2025
304b809
Enhance debugging in reward function of FullModelLLMTrainer by adding…
mdvillagra Jul 24, 2025
97e3a90
Add methods to handle boxed content and convert strings to numbers in…
mdvillagra Jul 24, 2025
09291ee
Fix reward function in FullModelLLMTrainer by updating variable names…
mdvillagra Jul 24, 2025
8c1ed2d
Update grpo_gsm8k_test_config.yaml to adjust client configuration and…
mdvillagra Jul 24, 2025
7d9a745
Refactor docstring in FullModelLLMAggregator's aggregate method for c…
mdvillagra Jul 24, 2025
3042670
Implement KL divergence logging in TimedGRPOTrainer and adjust learni…
mdvillagra Jul 25, 2025
0b28e4d
Update generation parameters and add scale_rewards option in FullMode…
mdvillagra Jul 25, 2025
3c9bff7
Adjust temperature parameter in generation settings of FullModelLLMTr…
mdvillagra Jul 25, 2025
d0d5c5a
Adjust top_p parameter in generation settings of FullModelLLMTrainer …
mdvillagra Jul 25, 2025
0700f72
Refactor generation parameters in FullModelLLMTrainer by adjusting te…
mdvillagra Jul 25, 2025
01aae96
Update learning rate in FullModelLLMTrainer to 5e-6 and uncomment sca…
mdvillagra Jul 25, 2025
4da64e2
Update generation parameters in FullModelLLMTrainer to include temper…
mdvillagra Jul 25, 2025
682fb55
Adjust max_completion_length in FullModelLLMTrainer to 100 and add ep…
mdvillagra Jul 25, 2025
850cdc2
Refactor custom trainer to integrate TrainingMetricsLogger and GRPOMe…
mdvillagra Jul 25, 2025
035704b
Remove wandb_entity parameter from FullModelLLMTrainer configuration …
mdvillagra Jul 25, 2025
f90c4ec
Increase max_completion_length in FullModelLLMTrainer from 100 to 256…
mdvillagra Jul 25, 2025
f9d0833
Comment out report_to parameter in FullModelLLMTrainer to disable Wei…
mdvillagra Jul 25, 2025
91eda1d
Update FullModelLLMTrainer to dynamically set run_name based on clien…
mdvillagra Jul 25, 2025
1a478c4
Update FullModelLLMTrainer to modify run_name format and change wandb…
mdvillagra Jul 25, 2025
5e53db2
Update FullModelLLMTrainer and grpo_gsm8k_test_config.yaml to enhance…
mdvillagra Jul 25, 2025
6791c4a
Update grpo_gsm8k_test_config.yaml to increase comm_round from 3 to 1…
mdvillagra Jul 25, 2025
2bd0c73
Enhance FullModelLLMAggregator with WandB logging for server statisti…
mdvillagra Jul 25, 2025
086c4bb
Update FullModelLLMTrainer to set logging_steps to 1 for more frequen…
mdvillagra Jul 25, 2025
4ac8dd0
Refactor TrainingMetricsLogger to use instance method for moving aver…
mdvillagra Jul 25, 2025
f2ccd9c
Enhance TrainingMetricsLogger and FullModelLLMAggregator with improve…
mdvillagra Jul 25, 2025
8692e47
Update FullModelLLMTrainer to disable log_completions for cleaner log…
mdvillagra Jul 26, 2025
8c57b88
Add checkpoint cleanup functionality to FullModelLLMTrainer and FullM…
mdvillagra Jul 26, 2025
262f45e
Update FullModelLLMTrainer to use a time-based seed for reproducibili…
mdvillagra Jul 26, 2025
af28fb6
Fix seed assignment in FullModelLLMTrainer to ensure proper formattin…
mdvillagra Jul 26, 2025
1adc9ed
Refactor aggregate method documentation in FullModelLLMAggregator to …
mdvillagra Jul 27, 2025
77222e1
Add method to cleanup old round checkpoints in FullModelLLMAggregator
mdvillagra Jul 27, 2025
09fb8b6
Add evaluation script for Qwen3-0.6B on GSM8K test split
mdvillagra Jul 27, 2025
daef416
Refactor aggregate method documentation in FullModelLLMAggregator and…
mdvillagra Jul 27, 2025
1da9db9
Update model name in save_initial_checkpoint.py from Qwen3-0.6B to Qw…
mdvillagra Jul 27, 2025
cf97042
Adjust GRPO batch size in grpo_gsm8k_test_config.yaml from 4 to 2 for…
mdvillagra Jul 27, 2025
8ee82b4
Update gradient_checkpointing settings in FullModelLLMTrainer and grp…
mdvillagra Jul 27, 2025
852d836
Add optimizer configuration in FullModelLLMTrainer
mdvillagra Jul 27, 2025
36bae8c
Add initial checkpoint saving in run_fedml_server_custom.sh
mdvillagra Jul 27, 2025
fed4829
Update training configuration and model parameters in FullModelLLMTra…
mdvillagra Jul 27, 2025
5870af8
Enhance custom trainer and configuration for improved logging and per…
mdvillagra Jul 27, 2025
5f4fd91
Reorganize environment variable settings for HF Transformers in custo…
mdvillagra Jul 27, 2025
c4bfc95
Disable flash attention in grpo_gsm8k_test_config.yaml to revert to s…
mdvillagra Jul 27, 2025
ad6c779
Enhance model loading and tokenizer initialization in custom trainer …
mdvillagra Jul 27, 2025
b672c1a
Add warnings filter in custom trainer to suppress advisory messages
mdvillagra Jul 27, 2025
a6930a9
Add gradient check for NaN/Inf values in training process
mdvillagra Jul 27, 2025
4f97311
Refactor training configuration and model handling in custom trainer
mdvillagra Jul 28, 2025
79b980f
Refactor checkpoint saving logic in run_fedllm.py
mdvillagra Jul 28, 2025
7667d99
Refactor checkpoint saving logic in run_fedllm.py
mdvillagra Jul 28, 2025
4481c0d
Enhance checkpoint saving logic in run_fedllm_custom.py
mdvillagra Jul 28, 2025
6b11954
Update GRPO configuration in grpo_gsm8k_test_config.yaml for testing
mdvillagra Jul 28, 2025
c358d92
Ensure model parameters are on CPU and clear CUDA cache in FullModelL…
mdvillagra Jul 28, 2025
134196d
Enhance timing and logging in TimedGRPOTrainer and FullModelLLMTrainer
mdvillagra Jul 29, 2025
56a701b
Add evaluation script for Qwen3-0.6B on GSM8K and enhance logging in …
mdvillagra Jul 29, 2025
a3a3240
Update GRPO configuration in grpo_gsm8k_test_config.yaml to switch fp…
mdvillagra Jul 29, 2025
ba8ac29
Add checkpoint cleanup functionality in FullModelLLMAggregator
mdvillagra Jul 29, 2025
c2311f6
Refactor average completion time logging in TimedGRPOTrainer
mdvillagra Jul 29, 2025
675dae5
Comment out the 'optim' parameter in FullModelLLMTrainer's GRPO confi…
mdvillagra Jul 29, 2025
078b48b
Enhance average completion time tracking in TrainingMetricsLogger
mdvillagra Jul 29, 2025
bced90b
Update model configuration and enhance logging in TrainingMetricsLogger
mdvillagra Jul 29, 2025
e19acfd
Update wallclock checkpoint retention policy in FullModelLLMAggregator
mdvillagra Jul 29, 2025
e8553c9
Enhance logging of average completion time in TimedGRPOTrainer
mdvillagra Jul 29, 2025
547cb65
Improve formatting of average completion time log in TimedGRPOTrainer
mdvillagra Jul 29, 2025
ba40e91
Update GRPO configuration and model training parameters
mdvillagra Jul 29, 2025
6c8c7ec
Refactor experience generation method in TimedGRPOTrainer
mdvillagra Jul 29, 2025
edf407d
Update model parameters and configurations for training optimization
mdvillagra Jul 29, 2025
2d8c1ab
Update model configurations and training parameters for improved perf…
mdvillagra Jul 30, 2025
b3e564c
Adjust max completion length and new token limit in FullModelLLMTrain…
mdvillagra Jul 30, 2025
eb2d32a
Enhance validation and memory management in training process
mdvillagra Jul 30, 2025
415cbcb
Update model configurations and training parameters for consistency a…
mdvillagra Jul 30, 2025
a2b5b98
Update model name in configuration files for consistency
mdvillagra Jul 30, 2025
2a68cb0
Update training parameters and memory management in FullModelLLMTrainer
mdvillagra Jul 30, 2025
a0ca721
Merge remote-tracking branch 'refs/remotes/origin/marcos/benchmarks' …
mdvillagra Jul 30, 2025
411b5ed
Update model configurations and training parameters for consistency a…
mdvillagra Jul 30, 2025
9145989
Update model configurations and training parameters for consistency
mdvillagra Jul 30, 2025
deef150
Update max completion length and new token limit in FullModelLLMTrain…
mdvillagra Jul 30, 2025
7272908
Merge remote-tracking branch 'refs/remotes/origin/marcos/benchmarks' …
mdvillagra Jul 30, 2025
fe57962
Comment out optim parameter in FullModelLLMTrainer to disable 8-bit A…
mdvillagra Jul 30, 2025
7bd11ba
Update model configurations and training parameters for improved perf…
mdvillagra Jul 30, 2025
df96bf0
Update client configuration parameters in grpo_gsm8k_test_config.yaml…
mdvillagra Jul 30, 2025
6e44080
Enhance TimedGRPOTrainer initialization and update model configurations
mdvillagra Jul 31, 2025
a097481
Update broadcast_object_list call in FullModelLLMTrainer to specify d…
mdvillagra Jul 31, 2025
a14707e
Refactor FullModelLLMTrainer to use float16 and enhance model paramet…
mdvillagra Jul 31, 2025
c7e89e7
Refactor broadcast_object_list call in FullModelLLMTrainer to remove …
mdvillagra Jul 31, 2025
7f02a25
Comment out CPU transfer for reference model in TimedGRPOTrainer and …
mdvillagra Jul 31, 2025
e9108d4
Refactor model initialization in TimedGRPOTrainer to retrieve model_i…
mdvillagra Jul 31, 2025
e80eedb
Update reference model configuration in TimedGRPOTrainer to use GPTQ-…
mdvillagra Jul 31, 2025
f5cc03c
Add docstring to TimedGRPOTrainer class for improved documentation
mdvillagra Jul 31, 2025
1b3500d
Enhance TimedGRPOTrainer with dropout control and reference model syn…
mdvillagra Jul 31, 2025
f457548
Update beta parameter in FullModelLLMTrainer to improve model perform…
mdvillagra Jul 31, 2025
4bf2a06
Update client configuration in grpo_gsm8k_test_config.yaml to support…
mdvillagra Aug 1, 2025
2ab2835
Adjust max completion length and new tokens in FullModelLLMTrainer fo…
mdvillagra Aug 1, 2025
403cac4
Update optimizer in FullModelLLMTrainer to use paged_adamw_8bit for e…
mdvillagra Aug 1, 2025
fc9f3ba
Enhance memory management in FullModelLLMTrainer by adding garbage co…
mdvillagra Aug 1, 2025
d9d6bed
Enhance TimedGRPOTrainer with fallback utilities and dropout management
mdvillagra Aug 1, 2025
315188c
Update optimizer and clean up memory management in FullModelLLMTrainer
mdvillagra Aug 1, 2025
cd0b61d
Comment out garbage collection import in custom_trainer.py to streaml…
mdvillagra Aug 1, 2025
a03174e
Update optimizer in FullModelLLMTrainer to paged_lion_8bit for improv…
mdvillagra Aug 1, 2025
cf6ecb2
Fix syntax error in optimizer assignment in FullModelLLMTrainer
mdvillagra Aug 1, 2025
9e1291f
Update gradient checkpointing settings in FullModelLLMTrainer and con…
mdvillagra Aug 1, 2025
841b6f0
Update gradient checkpointing settings in FullModelLLMTrainer and con…
mdvillagra Aug 1, 2025
e6c0ebb
Update max completion length and batch size in FullModelLLMTrainer an…
mdvillagra Aug 1, 2025
7a9ada3
Update max completion length and batch size in FullModelLLMTrainer an…
mdvillagra Aug 1, 2025
25c894a
Update max completion length and logging settings in FullModelLLMTrainer
mdvillagra Aug 1, 2025
3a56c1b
Update GRPO configuration for testing with increased epochs and batch…
mdvillagra Aug 1, 2025
28db52f
Update GRPO configuration in FullModelLLMTrainer for enhanced perform…
mdvillagra Aug 1, 2025
f0555a8
Update GRPO configuration for multi-client setup in grpo_gsm8k_test_c…
mdvillagra Aug 1, 2025
e84b11f
Update model configuration in GRPO test files to use Qwen3-0.6B
mdvillagra Aug 1, 2025
9a8ce66
Update model configuration and logging settings for GRPO testing
mdvillagra Aug 1, 2025
ba26160
Enhance validation script for custom model weights and output handling
mdvillagra Aug 2, 2025
5f19d21
Add paired permutation test script for model reward evaluation
mdvillagra Aug 2, 2025
655bf9e
Update GRPO configuration for reduced batch size and completion length
mdvillagra Aug 2, 2025
77cd14b
Enable SGD optimization in GRPO configuration for FullModelLLMTrainer
mdvillagra Aug 2, 2025
5622116
Reduce max completion length and new tokens in FullModelLLMTrainer fr…
mdvillagra Aug 2, 2025
aca1d32
Reduce max completion length and new tokens in FullModelLLMTrainer fr…
mdvillagra Aug 2, 2025
140fbda
Update max completion length and new tokens in FullModelLLMTrainer fr…
mdvillagra Aug 2, 2025
52f6122
Increase max completion length and new tokens in FullModelLLMTrainer …
mdvillagra Aug 4, 2025
133bb78
Refactor reference model loading in TimedGRPOTrainer to use AutoModel…
mdvillagra Aug 4, 2025
83f7051
Move reference model to the same device as the policy in TimedGRPOTra…
mdvillagra Aug 4, 2025
8da9d85
Update reference model in TimedGRPOTrainer to use Qwen/Qwen3-0.6B for…
mdvillagra Aug 4, 2025
0417ee8
Update reference model in TimedGRPOTrainer to use Qwen/Qwen3-0.6B-GPT…
mdvillagra Aug 4, 2025
2329b2c
Reduce max completion length and new tokens in FullModelLLMTrainer fr…
mdvillagra Aug 4, 2025
f5344ad
Increase max completion length and new tokens in FullModelLLMTrainer …
mdvillagra Aug 4, 2025
9d3c905
Remove deprecated SyncRefModelCallback implementation from custom_tra…
mdvillagra Aug 5, 2025
efd2e18
Remove fallback stub for prepare_fsdp in custom_trainer.py to streaml…
mdvillagra Aug 5, 2025
48ec150
Update reference model in TimedGRPOTrainer to use Qwen/Qwen3-1.7B for…
mdvillagra Aug 5, 2025
44e075f
Add device compatibility in TimedGRPOTrainer by moving batch tensors …
mdvillagra Aug 5, 2025
c08bc3b
Enhance batch handling in TimedGRPOTrainer by adding support for sing…
mdvillagra Aug 5, 2025
bfcf77c
Refactor batch tensor handling in TimedGRPOTrainer by removing unnece…
mdvillagra Aug 5, 2025
69194e9
Enhance tensor handling in TimedGRPOTrainer by ensuring log probabili…
mdvillagra Aug 5, 2025
800f566
Improve tensor device handling in TimedGRPOTrainer by adding checks f…
mdvillagra Aug 5, 2025
a3d7b7b
Refine tensor device alignment in TimedGRPOTrainer by implementing a …
mdvillagra Aug 5, 2025
e6e2c15
Optimize tensor device management in TimedGRPOTrainer by converting l…
mdvillagra Aug 5, 2025
92b942e
Adjust max completion length and new tokens in FullModelLLMTrainer to…
mdvillagra Aug 5, 2025
33f6ed2
Refactor TimedGRPOTrainer by adding docstrings for improved code docu…
mdvillagra Aug 5, 2025
760493c
Update reference model initialization in TimedGRPOTrainer to include …
mdvillagra Aug 5, 2025
a29d3eb
Update beta parameter in FullModelLLMTrainer to 0.1 for improved trai…
mdvillagra Aug 5, 2025
5cc0537
Update generation count in FullModelLLMTrainer to 4 and increase batc…
mdvillagra Aug 6, 2025
28d7513
Update generation count in FullModelLLMTrainer from 4 to 2 for optimi…
mdvillagra Aug 6, 2025
5c68b92
Reduce grpo_max_steps in GRPO test configuration from 50 to 20 for op…
mdvillagra Aug 6, 2025
2a7da13
Update client configuration in GRPO test setup to support 4 clients f…
mdvillagra Aug 6, 2025
79929f7
Update GRPO test configuration to reduce batch size from 2 to 1 for f…
mdvillagra Aug 6, 2025
8c8739f
Update reference model in TimedGRPOTrainer from "Qwen/Qwen3-1.7B" to …
mdvillagra Aug 6, 2025
19e0aad
Update reference model in TimedGRPOTrainer from "Qwen/Qwen3-0.6" to "…
mdvillagra Aug 6, 2025
e57d3e2
Refactor TimedGRPOTrainer to improve code documentation with added do…
mdvillagra Aug 7, 2025
3b7a416
Increase the number of retained old wallclock checkpoints from 6 to 1…
mdvillagra Aug 7, 2025
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -277,3 +277,6 @@ python/examples/launch/hello_world/fedml_job_entry_pack.bat
**mpi_host_file
/python/fedml/workflow/driver_example/customized_job_example/train_job/bootstrap.bat
/python/fedml/workflow/driver_example/customized_job_example/train_job/fedml_job_entry_pack.bat


venv
17 changes: 16 additions & 1 deletion python/fedml/cross_silo/server/fedml_server_manager.py
@@ -246,7 +246,22 @@ def handle_message_receive_model_from_client(self, msg_params):
if self.is_main_process():
mlops.log_aggregated_model_info(self.args.round_idx, model_url=global_model_url)

logging.info("\n\n==========end {}-th round training===========\n".format(self.args.round_idx))
# --------------------------------------------------
# Log global-update frequency in wall-clock terms
# --------------------------------------------------
current_ts = time.time()
# Compute and print only if this is not the very first round
if hasattr(self, "_last_round_end_ts") and self._last_round_end_ts is not None:
delta = current_ts - self._last_round_end_ts
if delta > 0:
freq = 1.0 / delta
logging.info(
f"Global update frequency: {freq:.4f} updates/sec ({delta:.2f} s per round)"
)
# Record timestamp for the next round
self._last_round_end_ts = current_ts

logging.info("\n\n==========end {}/{}-th round training===========\n".format(self.args.round_idx, self.round_num))
if self.args.round_idx < self.round_num:
mlops.event("server.wait", event_started=True, event_value=str(self.args.round_idx))

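As a standalone sketch, the wall-clock frequency logging added to `FedMLServerManager` in this diff amounts to the following (the class name `RoundFrequencyLogger` is hypothetical; the attribute and log format mirror the diff):

```python
import logging
import time


class RoundFrequencyLogger:
    """Tracks seconds-per-round and updates/sec across aggregation rounds,
    mirroring the `_last_round_end_ts` logic added in the diff above."""

    def __init__(self):
        self._last_round_end_ts = None

    def on_round_end(self):
        current_ts = time.time()
        freq = None
        # Compute and log only if this is not the very first round.
        if self._last_round_end_ts is not None:
            delta = current_ts - self._last_round_end_ts
            if delta > 0:
                freq = 1.0 / delta
                logging.info(
                    f"Global update frequency: {freq:.4f} updates/sec ({delta:.2f} s per round)"
                )
        # Record the timestamp for the next round.
        self._last_round_end_ts = current_ts
        return freq


tracker = RoundFrequencyLogger()
tracker.on_round_end()         # first round: nothing to report yet, returns None
# ... a training round runs here ...
print(tracker.on_round_end())  # later rounds: updates/sec since the previous round
```

Note the first call only records a timestamp; a frequency is reported from the second round onward.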
1,029 changes: 982 additions & 47 deletions python/spotlight_prj/fedllm/custom_trainer.py

Large diffs are not rendered by default.

158 changes: 158 additions & 0 deletions python/spotlight_prj/fedllm/data_formatting.py
@@ -0,0 +1,158 @@

from datasets import load_dataset


class DataFormatting:

    def __init__(self):
        self.system_prompt = """
Respond in the following format:

<reasoning>
...
</reasoning>

<answer>
...
</answer>
"""



    def extract_answer_from_model_output(self, text):
        """
        Extracts the value from the last <answer> tag in the text.

        Args:
            text (str): The model-generated text containing XML-style <answer> tags.

        Returns:
            str or None: The content inside the <answer> tags, or None if no valid answer is found.

        Explanation:
            1. Splits the text on the <answer> tag to isolate content after the tag.
            2. Checks if at least one <answer> tag exists in the text.
            3. For the last <answer> segment:
               - Verifies it contains a closing </answer> tag.
               - Extracts only the content between the tags.
            4. Returns None if the answer is empty (just "...") or if tags are missing.
        """
        # Split on <answer> and take everything after the last occurrence.
        parts = text.split("<answer>")
        if len(parts) < 2:  # No <answer> tag found
            return None

        last_part = parts[-1]

        # Extract the content up to </answer>.
        if "</answer>" not in last_part:
            return None

        answer = last_part.split("</answer>")[0].strip()
        return None if answer == "..." else answer


    def extract_answer_from_dataset(self, text):
        """
        Extracts the answer from a GSM8K dataset example.

        Args:
            text (str): The dataset example text containing a question and answer.

        Returns:
            str or None: The extracted answer after the '####' delimiter, or None.

        Explanation:
            1. Checks if the text contains the '####' delimiter that separates questions from answers.
            2. If found, splits the text at this delimiter and returns the second part.
            3. The answer is stripped of leading and trailing whitespace.
            4. Returns None if no delimiter is present.
        """
        if "####" not in text:
            return None
        return text.split("####")[1].strip()



    def prepare_dataset(self, split="train"):
        """
        Load and prepare the GSM8K dataset for training with string prompts.

        Args:
            split (str): The dataset split to load ("train" or "test"). Defaults to "train".

        Returns:
            list: A list of formatted examples, each containing a prompt string and an answer.

        Explanation:
            1. Loads the GSM8K dataset from the Hugging Face dataset hub.
            2. For each example in the dataset:
               - Creates a list of messages with the system prompt and the question.
               - Converts this list into a single string prompt using build_prompt().
               - Extracts the answer from the dataset example.
               - Builds a formatted example containing the prompt and answer.
            3. Returns the list of formatted examples ready for model training or evaluation.
        """
        data = load_dataset('openai/gsm8k', 'main')[split]

        formatted_data = []
        for example in data:
            # Convert the list of messages to a single string prompt.
            prompt_str = self.build_prompt([
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": example["question"]},
            ])
            formatted_example = {
                "prompt": prompt_str,  # a string rather than a list
                "answer": self.extract_answer_from_dataset(example["answer"]),
            }
            formatted_data.append(formatted_example)

        return formatted_data



    def build_prompt(self, messages):
        """
        Build a single prompt string from a list of messages.

        Args:
            messages (list): A list of message dictionaries, each with 'role' and 'content'.

        Returns:
            str: A concatenated string of all message content.

        Explanation:
            1. Takes a list of message dictionaries in the typical chat format.
            2. Extracts the 'content' field from each message and strips whitespace.
            3. Joins all content strings with newlines to create a single prompt.
            4. This preserves the training format while converting from structured messages.
        """
        return "\n".join(msg["content"].strip() for msg in messages)
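A minimal sketch of the round trip these helpers implement — extracting the gold answer from a GSM8K-style record and parsing a completion back out of the XML-style tags. The two extractors are restated inline as plain functions so the snippet runs standalone; the example strings are illustrative:

```python
def extract_answer_from_model_output(text):
    # Content of the last <answer>...</answer> pair, or None.
    parts = text.split("<answer>")
    if len(parts) < 2:
        return None
    last_part = parts[-1]
    if "</answer>" not in last_part:
        return None
    answer = last_part.split("</answer>")[0].strip()
    return None if answer == "..." else answer


def extract_answer_from_dataset(text):
    # GSM8K puts the gold answer after a '####' delimiter.
    return text.split("####")[1].strip() if "####" in text else None


gold = extract_answer_from_dataset("In May she sold 48 / 2 = 24 clips.\n#### 72")
pred = extract_answer_from_model_output(
    "<reasoning>48 + 24 = 72</reasoning>\n<answer>72</answer>"
)
print(gold, pred, gold == pred)  # → 72 72 True
```

Note that a completion that still contains the literal `...` placeholder from the system prompt parses to None rather than an empty answer.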

206 changes: 206 additions & 0 deletions python/spotlight_prj/fedllm/evaluation.py
@@ -0,0 +1,206 @@

import re

import torch
from data_formatting import DataFormatting


class Evaluation:

    def __init__(self):
        self.dat_fmt = DataFormatting()
    def extract_last_number(self, text):
        """
        Extracts the last number appearing in the text.

        Args:
            text (str): The text to extract a number from.

        Returns:
            float or None: The last number in the text, or None if no number is found.

        Explanation:
            1. Removes dollar signs and percentage symbols from the text.
            2. Uses regex to find a number that appears at the end of the text.
            3. The pattern matches numbers that appear at the end of the string.
            4. Returns the found number as a float, or None if no match is found.
        """
        text = text.replace('$', '').replace('%', '')
        pattern = r'(?:^|\s|=)\s*(-?\d*\.?\d+)\s*$'
        match = re.search(pattern, text)
        return float(match.group(1)) if match else None




    def extract_single_number(self, text):
        """
        Extracts a single number from text if exactly one number is present.

        Args:
            text (str): The text to extract a number from.

        Returns:
            float or None: The single number in the text, or None if zero or multiple numbers are found.

        Explanation:
            1. Uses regex to find all numbers in the text, including negative numbers.
            2. If exactly one number is found, returns it as a float.
            3. If zero or multiple numbers are found, returns None.
        """
        numbers = re.findall(r'-?\d*\.?\d+', text)
        if len(numbers) == 1:
            return float(numbers[0])
        return None



    def evaluate_model(self, model, tokenizer, eval_samples, device):
        """
        Evaluates the model on a set of examples and prints detailed results.

        Args:
            model: The language model to evaluate.
            tokenizer: The tokenizer for encoding inputs and decoding outputs.
            eval_samples (list): List of evaluation examples, each containing "prompt" and "answer".
            device: The device (CPU or GPU) to run evaluation on.

        Returns:
            float: The accuracy percentage (correct predictions / total examples * 100).

        Explanation:
            1. Sets the model to evaluation mode.
            2. For each example in the evaluation set:
               - Encodes the prompt and generates a response using the model.
               - Extracts the predicted answer from the generated response.
               - Compares the predicted answer with the expected answer using multiple methods:
                 a. Exact string matching.
                 b. Single-number extraction and comparison.
                 c. Last-number extraction and comparison.
               - Prints detailed information about each example.
            3. Calculates and returns the overall accuracy.
            4. Returns the model to training mode.
        """
        model.eval()

        correct = 0
        total = len(eval_samples)

        print("\n" + "=" * 50)
        print("EVALUATION ON", total, "EXAMPLES")
        print("=" * 50)

        for example in eval_samples:
            # Get the prompt and expected answer.
            full_prompt = example["prompt"]
            expected = example["answer"]

            # Tokenize and generate a response.
            inputs = tokenizer(
                full_prompt,
                return_tensors='pt',
                padding=False,
                truncation=False,
                return_attention_mask=True,
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=512,
                    do_sample=True,  # required for temperature to take effect
                    temperature=0.7,
                    num_return_sequences=1,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    forced_eos_token_id=tokenizer.eos_token_id,
                    early_stopping=False,
                )

            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            try:
                # Extract answers and check correctness.
                predicted = self.dat_fmt.extract_answer_from_model_output(response)

                # Try the different matching methods in order.
                if predicted == expected:  # Exact match
                    is_correct = True
                else:
                    # Try single-number matching.
                    pred_num = self.extract_single_number(str(predicted))
                    exp_num = self.extract_single_number(str(expected))
                    if pred_num is not None and exp_num is not None and pred_num == exp_num:
                        is_correct = True
                    else:
                        # Fall back to last-number matching.
                        pred_num = self.extract_last_number(str(predicted))
                        exp_num = self.extract_last_number(str(expected))
                        is_correct = (pred_num is not None and exp_num is not None and pred_num == exp_num)

                if is_correct:
                    correct += 1

                # Print evaluation results.
                print("\nPrompt:")
                print(full_prompt)
                print("\nExpected Answer:")
                print(expected)
                print("\nExtracted Answer:")
                print(predicted)
                print("\nFull Generated Response:")
                print(response)
                print("\nCorrect:", "✓" if is_correct else "✗")
                print("-" * 50)

            except Exception as e:
                print("\nFailed to parse the model output from prompt:")
                print(full_prompt)
                print('Error:', e)
                print('-' * 50)

        accuracy = (correct / total) * 100
        print(f"\nAccuracy: {accuracy:.2f}% ({correct}/{total})")

        # Return the model to training mode.
        model.train()

        return accuracy
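The three-stage answer-matching cascade used in `evaluate_model` can be sketched in isolation. The two helper regexes are restated inline as plain functions so the snippet runs standalone; `answers_match` is a hypothetical wrapper name, not a method of the class:

```python
import re


def extract_single_number(text):
    # Return the number if the text contains exactly one, else None.
    numbers = re.findall(r'-?\d*\.?\d+', text)
    return float(numbers[0]) if len(numbers) == 1 else None


def extract_last_number(text):
    # Return the trailing number of the text, else None.
    text = text.replace('$', '').replace('%', '')
    match = re.search(r'(?:^|\s|=)\s*(-?\d*\.?\d+)\s*$', text)
    return float(match.group(1)) if match else None


def answers_match(predicted, expected):
    """Exact string match, then single-number match, then last-number match."""
    if predicted == expected:
        return True
    p, e = extract_single_number(str(predicted)), extract_single_number(str(expected))
    if p is not None and e is not None and p == e:
        return True
    p, e = extract_last_number(str(predicted)), extract_last_number(str(expected))
    return p is not None and e is not None and p == e


print(answers_match("72", "72"))            # exact match → True
print(answers_match("$72", "72"))           # single-number match → True
print(answers_match("48 + 24 = 72", "72"))  # last-number match → True
```

The cascade is deliberately lenient: a completion like "48 + 24 = 72" contains three numbers, so single-number matching abstains and the last-number rule decides.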

