import datetime
import re

import torch
import transformers
from datasets import IterableDataset, concatenate_datasets, load_dataset
from transformers import AutoTokenizer


def data_generator(dataset):
    # Yield buffered rows one by one so they can be wrapped in an IterableDataset.
    for row in dataset:
        yield row


start_model_loading = datetime.datetime.now()
model = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)
end_model_loading = datetime.datetime.now()
print(f"Model {model} loaded successfully in {(end_model_loading - start_model_loading).total_seconds()} seconds.")

# Stream MIT/ISC-licensed C# files from the GitHub code dataset.
original_dataset = load_dataset(
    "codeparrot/github-code",
    streaming=True,
    split="train",
    licenses=["mit", "isc"],
    languages=["C#"],
    filter_languages=True,
    trust_remote_code=True,
)

BATCH_SIZE = 100
instance_number = 0        # examples collected in the current batch
batch_dataset = []         # buffered {code, refined code, summary} rows
is_dataset_created = False
batches_pushed = 0         # stop after pushing 3 batches

for example in original_dataset:
    prompt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a C# expert. Your task is to refine the C# code enclosed within the tags [C#] and [/C#]. The refined code should be enclosed within the tags [refined_C#] and [/refined_C#]. A summary of the changes should be enclosed within [code_changes] and [/code_changes]. You do not do anything more than the user asks you to do, and you do not generate any additional text.
<|eot_id|><|start_header_id|>user<|end_header_id|>
You have to refine the code based on these principles:
1. Class name: Refine the C# class name by making sure:
   a. It is PascalCase.
   b. It is logical. By logical, I mean it should be a noun and it should denote what the class does.
2. Property name: Refine the properties in the class by making sure:
   a. Private properties have _ before their names and public properties are PascalCase.
   b. They are logical. By logical, I mean there should be no unnecessary repetition. For example, a property named 'empId' in a class Employee is not clean because 'emp' is unnecessary: the class name already sets the context. The correct name would be 'id'. But make sure that the property name still makes sense.
3. Object name: Refine the objects instantiated by making sure:
   a. They are logical. By logical, I mean the names should be expressive. For example, 'Employee e' is not as expressive as 'Employee employee'.
4. Method name: Refine the methods in the class by making sure:
   a. They are logical. By logical, I mean the name of the method should express what it does.
   b. They are PascalCase.
[C#] {example["code"]} [/C#]<|eot_id|><|start_header_id|>assistant<|end_header_id|>[refined_C#]
"""
    no_of_retries = 0
    should_try = True
    while should_try:
        sequences = pipeline(
            prompt,
            temperature=0.2,
            top_p=0.9,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=5000,  # counts prompt plus completion tokens
            repetition_penalty=1.1,
        )
        output = ""
        for seq in sequences:
            output += seq["generated_text"]

        # generated_text contains the prompt as well, and the system instructions already
        # mention each tag pair once, so a well-formed completion yields exactly two matches:
        # the first from the instructions, the second from the model's actual answer.
        is_code_okay = False
        code_pattern = r"\[refined_C#\](.*?)\[/refined_C#\]"
        code_matches = re.findall(code_pattern, output, re.DOTALL)
        if len(code_matches) == 2:
            refined_code = code_matches[1]
            is_code_okay = True

        is_summary_okay = False
        summary_pattern = r"\[code_changes\](.*?)\[/code_changes\]"
        summary_matches = re.findall(summary_pattern, output, re.DOTALL)
        if len(summary_matches) == 2:
            summary = summary_matches[1]
            is_summary_okay = True

        if is_code_okay and is_summary_okay:
            batch_dataset.append({"code": example["code"], "refined code": refined_code, "summary": summary})
            instance_number += 1

        no_of_retries += 1
        # Stop retrying once both tags parsed correctly or after 5 attempts.
        if (is_code_okay and is_summary_okay) or no_of_retries == 5:
            should_try = False

    if instance_number == BATCH_SIZE:
        if not is_dataset_created:
            # First batch: create the Hub dataset from the buffered rows.
            # Note: push_to_hub on an IterableDataset needs a recent `datasets` release.
            new_dataset = IterableDataset.from_generator(data_generator, gen_kwargs={"dataset": batch_dataset})
            new_dataset.push_to_hub("llama-3-1-refined-code")
            is_dataset_created = True
            print("Pushed data for the first time")
        else:
            # Later batches: append to what is already on the Hub and push again.
            refined_code_dataset = load_dataset("llama-3-1-refined-code", streaming=True, split="train")
            new_dataset = concatenate_datasets(
                [refined_code_dataset, IterableDataset.from_generator(data_generator, gen_kwargs={"dataset": batch_dataset})]
            )
            new_dataset.push_to_hub("llama-3-1-refined-code")
            print("Pushed data again")
        instance_number = 0
        batch_dataset = []
        batches_pushed += 1
        if batches_pushed == 3:
            break
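
# Illustrative check (a hedged sketch, not required by the job above): why the loop expects
# exactly two matches per tag pair. The pipeline output includes the prompt, whose
# instructions mention [refined_C#]...[/refined_C#] once, plus the model's tagged answer.
# `_sample_output` is a made-up string for demonstration only.
_sample_output = (
    "Refined code should be enclosed with tags [refined_C#] and [/refined_C#]."
    " [refined_C#] public class Employee { public int Id { get; set; } } [/refined_C#]"
)
_matches = re.findall(r"\[refined_C#\](.*?)\[/refined_C#\]", _sample_output, re.DOTALL)
assert len(_matches) == 2              # first match from the instructions, second from the answer
assert "class Employee" in _matches[1]  # the second match is what the loop keeps as refined_code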