from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

def formatting_prompts_func(examples):
    # Render each conversation into one training string via the chat template.
    # Relies on the module-level `tokenizer` configured below; it is defined
    # before this function is actually called in dataset.map.
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
def format_to_conversations(examples):
    # Wrap each (code, refined code, summary) triple into a two-turn
    # user/assistant conversation delimited by explicit tags.
    conversations = []
    codes = examples["code"]
    refined_codes = examples["refined code"]
    summaries = examples["summary"]
    for i in range(len(refined_codes)):
        user_content = f'''Refine the C# code enclosed within tags [C#] and [/C#].
[C#]
{codes[i]}
[/C#]
'''
        assistant_content = f'''
[refined_C#]
{refined_codes[i]}
[/refined_C#]
[code_changes]
{summaries[i]}
[/code_changes]
'''
        conversation = [
            {'content': user_content, 'role': 'user'},
            {'content': assistant_content, 'role': 'assistant'},
        ]
        conversations.append(conversation)
    return { "conversations" : conversations }
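
# Illustrative sanity check (not part of the original pipeline): run the
# formatter on a hypothetical toy batch to show the conversation structure
# that formatting_prompts_func later consumes. Field values are made up.
_toy_batch = {
    "code": ["int x=1;"],
    "refined code": ["int x = 1;"],
    "summary": ["Added spaces around the assignment operator."],
}
print(format_to_conversations(_toy_batch)["conversations"][0])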
max_seq_length = 32768 # Choose any! Unsloth auto-supports RoPE scaling internally.
dtype = None # None for auto-detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True # Use 4-bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
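
# Optional generation sanity check (a sketch, assuming unsloth's
# FastLanguageModel.for_inference helper and a CUDA device): confirms the
# llama-3.1 chat template renders and the model responds before mapping the
# full dataset. The sample prompt below is hypothetical.
FastLanguageModel.for_inference(model)
sample = [{"role": "user", "content": "Refine the C# code enclosed within tags [C#] and [/C#].\n[C#]\nint x=1;\n[/C#]\n"}]
input_ids = tokenizer.apply_chat_template(sample, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
output_ids = model.generate(input_ids = input_ids, max_new_tokens = 128)
print(tokenizer.decode(output_ids[0], skip_special_tokens = True))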
dataset = load_dataset("atharva2721/refined-train-aggregated", split = "train")
dataset = dataset.map(format_to_conversations, batched = True)
dataset = dataset.map(formatting_prompts_func, batched = True)
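
# Quick inspection of one rendered training example (illustrative): the
# "text" column now holds the full llama-3.1 chat-formatted string.
print(dataset[0]["text"][:300])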
dataset.push_to_hub('llama-standardized-refined-test-aggregated')
print('Dataset pushed to hub')
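
# Optional round-trip check (a sketch): reload the pushed dataset to confirm
# the upload succeeded. The repo id below assumes it lands under the same
# `atharva2721` namespace as the source dataset; adjust to your own account.
reloaded = load_dataset("atharva2721/llama-standardized-refined-test-aggregated", split = "train")
print(reloaded[0]["text"][:300])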