Adjust fine-tuning and other scripts

bf928bca · Atharva Jadhav · 88b90ea7 · bf928bca · bf928bca · bf928bca
Commit bf928bca authored 1 month ago by Atharva Jadhav
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,6 @@ data_generation/logs
 fine-tuning/logs
 fine-tuning/models
 fine-tuning/outputs
-fine-tuning/wandb
\ No newline at end of file
+fine-tuning/wandb
+data_standardization/logs
+inference/logs
\ No newline at end of file
--- a/data_standardization/dataset_standardization.py
+++ b/data_standardization/dataset_standardization.py
@@ -12,10 +12,7 @@ def format_to_conversations(examples):
  refined_codes = examples["refined code"]
  summaries = examples["summary"]
  for i in range(len(refined_codes)):
-      user_content = f'''Refine the C# code enclosed within tags [C#] and [/C#]. Return the refined code should be enclosed with tags [refined_C#] and [/refined_C#].
-      Summary of changes should be enclosed with [code_changes] and [/code_changes].
-      You do not do anything more than user asks you do it.
-      You do not generate any additional text.
+      user_content = f'''Refine the C# code enclosed within tags [C#] and [/C#].

      [C#]
      {codes[i]}
@@ -52,10 +49,10 @@ model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit = load_in_4bit,
 )

-dataset = load_dataset("atharva2721/qwen-refined-code", split = "train")
+dataset = load_dataset("atharva2721/refined-test-aggregated", split = "train")

 dataset = dataset.map(format_to_conversations, batched = True,)
 dataset = dataset.map(formatting_prompts_func, batched = True,)

-dataset.push_to_hub('csharp-qwen-standardized')
+dataset.push_to_hub('standardized-refined-test-aggregated')
 print('Dataset pushed to hub')
\ No newline at end of file
--- a/data_standardization/slurm_data_standardization.sh
+++ b/data_standardization/slurm_data_standardization.sh
@@ -2,9 +2,9 @@

 ### Add basic configuration for job

-#SBATCH --job-name=fine_tuning
-#SBATCH --output=logs/fine_tuning_%j.log
-#SBATCH --error=logs/fine_tuning_error_%j.log
+#SBATCH --job-name=dataset_standardization
+#SBATCH --output=logs/dataset_standardization_%j.log
+#SBATCH --error=logs/dataset_standardization_error_%j.log
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=5
@@ -17,7 +17,7 @@
 ### Run the project in work directory of the cluster (configure based on need!! 
 ### RWTH File System : https://help.itc.rwth-aachen.de/en/service/rhr4fjjutttf/article/da307ec2c60940b29bd42ac483fc3ea7/
 cd $HPCWORK
-cd codebud/fine-tuning
+cd codebud/data_standardization
 ###------------------------------------------------------------------------------------------------------------------------------

 ### JOB SCRIPT RUN
@@ -32,7 +32,6 @@ python --version

 python dataset_standardization.py

-
 module unload CUDA
 module unload Python/3.11.5


--- a/fine-tuning/main_fine_tuning.py
+++ b/fine-tuning/main_fine_tuning.py
@@ -22,10 +22,10 @@ print(f'Model loaded successfully at {datetime.datetime.now()}', flush=True)

 model = FastLanguageModel.get_peft_model(
    model,
-    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
-    lora_alpha = 16,
+    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
@@ -40,8 +40,8 @@ tokenizer = get_chat_template(
    chat_template = "qwen-2.5",
 )

-
-dataset = load_dataset("atharva2721/csharp-qwen-standardized", split = "train")
+dataset = load_dataset("atharva2721/standardized-refined-train-aggregated", split = "train")
+validation_dataset = load_dataset("atharva2721/standardized-refined-val-aggregated", split = "train")

 wandb.init(project="codebud")

@@ -49,6 +49,7 @@ trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
+    eval_dataset=validation_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
@@ -57,19 +58,26 @@ trainer = SFTTrainer(
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # Fixed major bug in latest Unsloth
-        warmup_steps = 5,
-        num_train_epochs = 1, # Set this for 1 full training run.
+        warmup_ratio = 0.1,
+        num_train_epochs = 3, # Number of full passes over the training set.
        #max_steps = 60,
-        learning_rate = 2e-4,
+        learning_rate = 2e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
-        logging_steps = 1,
+        eval_strategy="steps",
+        eval_steps=656,
+        fp16_full_eval = not is_bfloat16_supported(),
+        bf16_full_eval = is_bfloat16_supported(),
+        logging_steps = 10,
+        save_steps = 656,
        optim = "paged_adamw_8bit", # Save more memory
        weight_decay = 0.01,
-        lr_scheduler_type = "linear",
+        lr_scheduler_type = "cosine",
        seed = 3407,
+        remove_unused_columns=False,
        output_dir = "outputs",
        report_to = "wandb", # Use this for WandB etc
+        run_name = "run-name"
    ),
 )

@@ -93,7 +101,6 @@ print(f"{start_gpu_memory} GB of memory reserved.", flush=True)
 print(f'Everything initialized. Starting the training at {datetime.datetime.now()}', flush=True)

 trainer_stats = trainer.train()
-
 print(f'Successfully completed training at {datetime.datetime.now()}', flush=True)

 used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
@@ -110,10 +117,10 @@ print(f"Peak reserved memory % of max memory = {used_percentage} %.")
 print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

 print(f'Pushing model and tokenizer at {datetime.datetime.now()}', flush=True)
-model.save_pretrained("models/first_finetuned_model_one_epochs")  # Local saving
-tokenizer.save_pretrained("models/first_finetuned_model_one_epochs")
-model.push_to_hub("first_finetuned_model_one_epochs") # Online saving
-tokenizer.push_to_hub("first_finetuned_model_one_epochs") # Online saving
+model.save_pretrained("models/finetuned_model_with_eval")  # Local saving
+tokenizer.save_pretrained("models/finetuned_model_with_eval")
+model.push_to_hub("finetuned_model_with_eval") # Online saving
+tokenizer.push_to_hub("finetuned_model_with_eval") # Online saving

 wandb.finish()
 print(f'Run complete at {datetime.datetime.now()}', flush=True)
\ No newline at end of file
--- a/fine-tuning/slurm_finetune.sh
+++ b/fine-tuning/slurm_finetune.sh
@@ -9,7 +9,7 @@
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=5
 #SBATCH --gres=gpu:1
-#SBATCH --time=00:30:00
+#SBATCH --time=12:00:00


 ###------------------------------------------------------------------------------------------------------------------------------
@@ -29,9 +29,8 @@ source ../../venvs/codebud/bin/activate
 echo $VIRTUAL_ENV

 python --version
-#python main_fine_tuning.py
-#python dataset_standardization.py
-python inference.py
+
+python main_fine_tuning.py

 module unload CUDA
 module unload Python/3.11.5

--- a/inference/inference.py
+++ b/inference/inference.py
@@ -13,13 +13,10 @@ model, tokenizer = FastLanguageModel.from_pretrained(
 FastLanguageModel.for_inference(model) # Enable native 2x faster inference

 code = """
-using System; using System.IO; using System.ServiceProcess; using InEngine.Core; //using Mono.Unix; //using Mono.Unix.Native; namespace InEngine { class Program { public const string ServiceName = "InEngine.NET"; public static ServerHost ServerHost { get; set; } static void Main(string[] args) { /* * Set current working directory as services use the system directory by default. * Also, maybe run from the CLI from a different directory than the application root. */ Directory.SetCurrentDirectory(AppDomain.CurrentDomain.BaseDirectory); new ArgumentInterpreter().Interpret(args); } /// <summary> /// Start the server as a service or as a CLI program in the foreground. /// </summary> public static void RunServer() { var settings = InEngineSettings.Make(); ServerHost = new ServerHost() { MailSettings = settings.Mail, QueueSettings = settings.Queue, }; if (!Environment.UserInteractive && Type.GetType("Mono.Runtime") == null) { using (var service = new Service()) ServiceBase.Run(service); } else { ServerHost.Start(); Console.WriteLine("Press any key to exit..."); Console.ReadLine(); ServerHost.Dispose(); } } static void Start(string[] args) { ServerHost.Start(); } static void Stop() { ServerHost.Dispose(); } public class Service : ServiceBase { public Service() { ServiceName = Program.ServiceName; } protected override void OnStart(string[] args) { Start(args); } protected override void OnStop() { Stop(); } } } }
+
 """
 content = f'''
-Refine the C# code enclosed within tags [C#] and [/C#]. Return the refined code should be enclosed with tags [refined_C#] and [/refined_C#].
-Summary of changes should be enclosed with [code_changes] and [/code_changes].
-You do not do anything more than user asks you do it.
-You do not generate any additional text.
+Refine the C# code enclosed within tags [C#] and [/C#].

 [C#]
 {code}