# (removed: copy/paste residue — "Newer"/"Older" pagination controls and a
#  line-number gutter from a web code viewer; not part of the program)
import datetime
import re

import torch
import transformers
from datasets import Dataset, IterableDataset, concatenate_datasets, load_dataset
from transformers import AutoTokenizer
def data_generator(dataset):
    """Yield every row of *dataset* unchanged.

    Thin generator adapter so an in-memory list of rows can be fed to
    ``IterableDataset.from_generator``.
    """
    yield from dataset
# --- Model setup (wall-clock timed) ---
start_model_loading = datetime.datetime.now()
model = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model)
# bfloat16 weights; device_map="auto" lets accelerate place layers on the
# available device(s) automatically.
pipeline = transformers.pipeline("text-generation",
model=model, model_kwargs={"torch_dtype": torch.bfloat16}, device_map = "auto")
end_model_loading = datetime.datetime.now()
print(f'Model: {model} loaded successfully in {(end_model_loading-start_model_loading).total_seconds()} seconds.')
# --- Source data: stream MIT/ISC-licensed C# files from github-code ---
original_dataset = load_dataset("codeparrot/github-code", streaming=True, split="train", licenses=["mit", "isc"], languages = ['C#'], filter_languages=True, trust_remote_code=True)
# --- Batching state for pushes to the Hub ---
BATCH_SIZE = 100  # successfully refined examples accumulated before each push
instance_number = 0  # refined examples collected in the current batch
batch_dataset = []  # current batch rows: {'code', 'refined code', 'summary'}
is_dataset_created = False  # True once the Hub dataset has received its first push
ss=0  # number of batches pushed so far (loop stops after 3)
# Main loop: for each streamed C# file, ask the model to refine it, parse the
# tagged sections out of the generation (with up to 5 retries per example),
# and push accumulated results to the Hub in batches of BATCH_SIZE.
for example in original_dataset:
    # Llama-3.1 chat-format prompt. The assistant turn is pre-seeded with an
    # opening [refined_C#] tag so the completion starts with the refined code.
    prompt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a C# expert.
Your task is to refine the C# code enclosed within tags [C#] and [/C#].
Refined code should be enclosed with tags [refined_C#] and [/refined_C#].
Summary of changes should be enclosed with [code_changes] and [/code_changes].
You do not do anything more than user asks you do it.
You do not generate any additional text.
<|eot_id|><|start_header_id|>user<|end_header_id|>
You have the refine the code based on principles:
1. Class name: Refine C# class name by making sure:
a. It is PascalCase.
b. It is logical. By logical, I mean, it should be a noun and it should denote what it does.
2. Property name: Refine the properties in the class by making sure:
a. Private properties have _ before their names and public properties are PascalCase.
b. It is logical. By logical, I mean, there should be no unnecessary repetition. For example property name as 'empId' in class Employee is not clean as 'emp' is unnecessary. This is because the class name already sets the context. The correct name would be 'id'. But make sure that the property name makes sense.
3. Object name: Refine the objects instantiated by making sure:
a. It is logical. By logical I mean, the names should be expressive. For example 'Employee e' is not as expressive 'Employee employee'.
4. Method name: Refine methods in the class by making sure:
a. It is logical. By logical, I mean the name of method should express what it does.
b. It is PascalCase.
[C#]
{example["code"]}
[/C#]<|eot_id|><|start_header_id|>assistant<|end_header_id|>[refined_C#]
"""
    no_of_retries = 0
    should_try = True
    while should_try:
        # Sampling (temperature/top_p) means a retry can produce a
        # well-formed generation where the previous attempt did not.
        sequences = pipeline(prompt, temperature=0.2,
                             top_p=0.9,
                             num_return_sequences=1,
                             eos_token_id=tokenizer.eos_token_id,
                             max_length=5000,
                             repetition_penalty=1.1)
        output = ""
        for seq in sequences:
            output += seq["generated_text"]
        # generated_text echoes the prompt, and the system message itself
        # contains one [refined_C#]...[/refined_C#] pair (the instruction
        # text), so a well-formed answer yields exactly two matches and the
        # model's code is the second one (index 1). Same for [code_changes].
        code_pattern = r'\[refined_C#\](.*?)\[/refined_C#\]'
        code_matches = re.findall(code_pattern, output, re.DOTALL)
        is_code_okay = False
        if len(code_matches) == 2:
            refined_code = code_matches[1]
            is_code_okay = True
        summary_pattern = r'\[code_changes\](.*?)\[/code_changes\]'
        summary_matches = re.findall(summary_pattern, output, re.DOTALL)
        is_summary_okay = False
        if len(summary_matches) == 2:
            summary = summary_matches[1]
            is_summary_okay = True
        if is_code_okay and is_summary_okay:
            batch_dataset.append({'code': example["code"], 'refined code': refined_code, 'summary': summary})
            instance_number += 1
        no_of_retries += 1
        # Stop retrying on success, or give up on this example after 5 tries.
        if (is_code_okay and is_summary_okay) or no_of_retries == 5:
            should_try = False
    if instance_number == BATCH_SIZE:
        # FIX: materialize the batch as a map-style Dataset. The original
        # built an IterableDataset, which has no push_to_hub() method.
        batch = Dataset.from_list(batch_dataset)
        if not is_dataset_created:
            batch.push_to_hub('llama-3-1-refined-code')
            is_dataset_created = True
            print('Pushed data for first time')
        else:
            # FIX: request a concrete split (non-streaming) so we get a
            # Dataset back — load_dataset(..., streaming=True) without a
            # split returns a dataset dict that cannot be concatenated.
            existing = load_dataset('llama-3-1-refined-code', split='train')
            # FIX: concatenate_datasets takes a *list* of datasets, not two
            # positional dataset arguments.
            combined = concatenate_datasets([existing, batch])
            combined.push_to_hub('llama-3-1-refined-code')
            print('Pushed data again')
        # Reset per-batch state.
        instance_number = 0
        batch_dataset = []
        ss += 1
        # Stop after three pushed batches.
        if ss == 3:
            break