from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset
import re

def data_generator(dataset):
    # Yield rows one at a time so Dataset.from_generator can rebuild a dataset.
    for row in dataset:
        yield row
max_seq_length = 32768 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "atharva2721/llama_finetuned_model",
    # model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
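
# Optional sanity check (a sketch, not from the original script): render one
# templated prompt so the assistant marker used when parsing generations below
# can be verified against this tokenizer's actual chat template.
probe = tokenizer.apply_chat_template(
    [{"role": "user", "content": "ping"}],
    tokenize = False,
    add_generation_prompt = True,
)
print(probe, flush=True)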
reference_dataset = load_dataset("atharva2721/qwen_inference_output", split="train", trust_remote_code=True)
inference_output = []  # accumulated comparison rows
code_no = 0            # examples seen
inferred_no = 0        # successful refinements
failed_no = 0          # examples that failed all retries
for example in reference_dataset:
    code_no += 1
    content = f'''Refine the C# code enclosed within tags [C#] and [/C#].
Provide the refined code enclosed within tags [refined_C#] and [/refined_C#].
The summary of changes must be enclosed within tags [code_changes] and [/code_changes].
[C#]
{example["code"]}
[/C#]
'''
    messages = [
        {"role": "user", "content": content},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,  # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    should_retry = True
    retry_no = 0
    while should_retry:
        output_tensor = model.generate(
            input_ids = inputs,
            max_length = max_seq_length,
            do_sample = True,   # sampling must be enabled for temperature to apply
            temperature = 0.6,
        )
        decoded = tokenizer.batch_decode(output_tensor)
        output = "".join(decoded)
        print(output)
        # Split off the assistant turn. Note: this marker must match the
        # tokenizer's chat template ('<|im_start|>assistant' is the ChatML/Qwen
        # convention; stock Llama 3.1 templates instead emit
        # '<|start_header_id|>assistant<|end_header_id|>').
        parts = output.split('<|im_start|>assistant')
        if len(parts) == 2:
            output = parts[1]
            code_pattern = r'\[refined_C#\](.*?)\[/refined_C#\]'
            summary_pattern = r'\[code_changes\](.*?)\[/code_changes\]'
            code_matches = re.search(code_pattern, output, re.DOTALL)
            summary_matches = re.search(summary_pattern, output, re.DOTALL)
            if code_matches and summary_matches:
                refined_code = code_matches.group(1)
                summary = summary_matches.group(1)
                inference_output.append({
                    'code': example["code"],
                    'base inference': refined_code,
                    'base summary': summary,
                    'finetuned inference': example["finetuned inference"],
                    'finetuned summary': example["finetuned summary"],
                    'reference inference': example["reference inference"],
                    'reference summary': example["reference summary"],
                })
                print(f'Code no. {code_no} refined successfully', flush=True)
                should_retry = False
                inferred_no += 1
        if retry_no == 2 and should_retry:
            should_retry = False
            print(f'Failed to refine code at {code_no}. Final try output: \n[failed_output]{output}[/failed_output]', flush=True)
            failed_no += 1
        retry_no += 1
    if code_no == 5:
        break  # debug cap: process only the first five examples
# new_dataset = Dataset.from_generator(data_generator, gen_kwargs={"dataset": inference_output})
# new_dataset.push_to_hub('llama_inference_output')
# print(f'Created and pushed total of {inferred_no} examples from total of {code_no} codes. Total failed inferences are {failed_no}', flush=True)
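
# Fallback sketch (assumption: a local copy is useful while the Hub push above
# stays commented out): persist the collected rows to JSON on disk.
import json
with open("llama_inference_output.json", "w", encoding="utf-8") as f:
    json.dump(inference_output, f, indent=2, ensure_ascii=False)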