# (removed: copy/paste residue — "Newer"/"Older" pagination controls and a
#  line-number gutter from a web code viewer; not part of the program)
import datetime
import re

import torch
import transformers
from datasets import Dataset, IterableDataset, concatenate_datasets, load_dataset
from transformers import AutoTokenizer
def data_generator(dataset):
    """Yield every row of *dataset* unchanged.

    Thin generator adapter so an in-memory list of rows can be fed to
    ``IterableDataset.from_generator``.
    """
    yield from dataset
# --- Model setup (wall-clock timed) ---
start_model_loading = datetime.datetime.now()
model = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model)
# bfloat16 weights; device_map="auto" lets accelerate place layers on the
# available device(s) automatically.
pipeline = transformers.pipeline("text-generation",
model=model, model_kwargs={"torch_dtype": torch.bfloat16}, device_map = "auto")
end_model_loading = datetime.datetime.now()
print(f'Model: {model} loaded successfully in {(end_model_loading-start_model_loading).total_seconds()} seconds.')
# --- Source data: stream MIT/ISC-licensed C# files from github-code ---
original_dataset = load_dataset("codeparrot/github-code", streaming=True, split="train", licenses=["mit", "isc"], languages = ['C#'], filter_languages=True, trust_remote_code=True)
# --- Batching state for pushes to the Hub ---
BATCH_SIZE = 100  # successfully refined examples accumulated before each push
instance_number = 0  # refined examples collected in the current batch
batch_dataset = []  # current batch rows: {'code', 'refined code', 'summary'}
is_dataset_created = False  # True once the Hub dataset has received its first push
ss=0  # number of batches pushed so far (loop stops after 3)
# Main loop: for each streamed C# file, ask the model to refine it, parse the
# tagged sections out of the generation (with up to 5 retries per example),
# and push accumulated results to the Hub in batches of BATCH_SIZE.
for example in original_dataset:
    # Llama-3.1 chat-format prompt. The assistant turn is pre-seeded with an
    # opening [refined_C#] tag so the completion starts with the refined code.
    prompt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a C# expert.
Your task is to refine the C# code enclosed within tags [C#] and [/C#].
Refined code should be enclosed with tags [refined_C#] and [/refined_C#].
Summary of changes should be enclosed with [code_changes] and [/code_changes].
You do not do anything more than user asks you do it.
You do not generate any additional text.
<|eot_id|><|start_header_id|>user<|end_header_id|>
You have the refine the code based on principles:
1. Class name: Refine C# class name by making sure:
a. It is PascalCase.
b. It is logical. By logical, I mean, it should be a noun and it should denote what it does.
2. Property name: Refine the properties in the class by making sure:
a. Private properties have _ before their names and public properties are PascalCase.
b. It is logical. By logical, I mean, there should be no unnecessary repetition. For example property name as 'empId' in class Employee is not clean as 'emp' is unnecessary. This is because the class name already sets the context. The correct name would be 'id'. But make sure that the property name makes sense.
3. Object name: Refine the objects instantiated by making sure:
a. It is logical. By logical I mean, the names should be expressive. For example 'Employee e' is not as expressive 'Employee employee'.
4. Method name: Refine methods in the class by making sure:
a. It is logical. By logical, I mean the name of method should express what it does.
b. It is PascalCase.
[C#]
{example["code"]}
[/C#]<|eot_id|><|start_header_id|>assistant<|end_header_id|>[refined_C#]
"""
    no_of_retries = 0
    should_try = True
    while should_try:
        # Sampling (temperature/top_p) means a retry can produce a
        # well-formed generation where the previous attempt did not.
        sequences = pipeline(prompt, temperature=0.2,
                             top_p=0.9,
                             num_return_sequences=1,
                             eos_token_id=tokenizer.eos_token_id,
                             max_length=5000,
                             repetition_penalty=1.1)
        output = ""
        for seq in sequences:
            output += seq["generated_text"]
        # generated_text echoes the prompt, and the system message itself
        # contains one [refined_C#]...[/refined_C#] pair (the instruction
        # text), so a well-formed answer yields exactly two matches and the
        # model's code is the second one (index 1). Same for [code_changes].
        code_pattern = r'\[refined_C#\](.*?)\[/refined_C#\]'
        code_matches = re.findall(code_pattern, output, re.DOTALL)
        is_code_okay = False
        if len(code_matches) == 2:
            refined_code = code_matches[1]
            is_code_okay = True
        summary_pattern = r'\[code_changes\](.*?)\[/code_changes\]'
        summary_matches = re.findall(summary_pattern, output, re.DOTALL)
        is_summary_okay = False
        if len(summary_matches) == 2:
            summary = summary_matches[1]
            is_summary_okay = True
        if is_code_okay and is_summary_okay:
            batch_dataset.append({'code': example["code"], 'refined code': refined_code, 'summary': summary})
            instance_number += 1
        no_of_retries += 1
        # Stop retrying on success, or give up on this example after 5 tries.
        if (is_code_okay and is_summary_okay) or no_of_retries == 5:
            should_try = False
    if instance_number == BATCH_SIZE:
        # FIX: materialize the batch as a map-style Dataset. The original
        # built an IterableDataset, which has no push_to_hub() method.
        batch = Dataset.from_list(batch_dataset)
        if not is_dataset_created:
            batch.push_to_hub('llama-3-1-refined-code')
            is_dataset_created = True
            print('Pushed data for first time')
        else:
            # FIX: request a concrete split (non-streaming) so we get a
            # Dataset back — load_dataset(..., streaming=True) without a
            # split returns a dataset dict that cannot be concatenated.
            existing = load_dataset('llama-3-1-refined-code', split='train')
            # FIX: concatenate_datasets takes a *list* of datasets, not two
            # positional dataset arguments.
            combined = concatenate_datasets([existing, batch])
            combined.push_to_hub('llama-3-1-refined-code')
            print('Pushed data again')
        # Reset per-batch state.
        instance_number = 0
        batch_dataset = []
        ss += 1
        # Stop after three pushed batches.
        if ss == 3:
            break