Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import csv
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def create_dataframe(matrix, tokens) -> pd.DataFrame:
    """Wrap a 2-D matrix in a DataFrame whose rows are labelled per document.

    Args:
        matrix: Sized 2-D array-like with one row per document;
            ``len(matrix)`` determines how many row labels are generated.
        tokens: Column labels, passed straight through to ``pd.DataFrame``.

    Returns:
        A DataFrame holding ``matrix`` with index ``doc_1`` .. ``doc_N``
        (1-based) and ``tokens`` as its columns.
    """
    # Row labels are 1-based: doc_1, doc_2, ...
    doc_names = [f"doc_{number}" for number in range(1, len(matrix) + 1)]
    return pd.DataFrame(data=matrix, index=doc_names, columns=tokens)
# Script body: for every example in the dataset, compare the 'base inference'
# text against the 'reference inference' text using cosine similarity over
# both raw token counts and TF-IDF weights, and write one CSV row per example.

# NOTE(review): trust_remote_code=True is passed through to load_dataset;
# presumably it allows the dataset repository's own loading code to run
# locally -- confirm the source is trusted.
reference_dataset = load_dataset(
    "atharva2721/qwen_inference_output_complete",
    split="train",
    trust_remote_code=True,
)

fieldnames = ['code number', 'cosine similarity score (CountVectorizer)', 'cosine similarity score (TfidfVectorizer)']

# newline='' is what the csv module documentation requires for files handed
# to csv writers; without it, platforms that translate "\n" (e.g. Windows)
# end up with a blank line after every row.
with open('eval_reports/cosine-similarity-qwen-base-responses-evaluation-pass.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

    # code_number is the 1-based position of the example in the dataset.
    for code_number, example in enumerate(reference_dataset, start=1):
        # Formatting through an f-string coerces non-string values to text.
        doc_1 = f"{example['base inference']}"
        doc_2 = f"{example['reference inference']}"
        data = [doc_1, doc_2]

        # NOTE(review): per the scikit-learn docs, fit_transform raises
        # ValueError when no document yields a token (empty vocabulary);
        # that would abort the whole run -- confirm the dataset cannot
        # contain such rows.
        count_vectorizer = CountVectorizer()
        vector_matrix_count = count_vectorizer.fit_transform(data)
        cosine_similarity_matrix_count = cosine_similarity(vector_matrix_count)
        df_cosine_count = create_dataframe(cosine_similarity_matrix_count, ["doc_1", "doc_2"])
        print(f"Cosine Similarity (CountVectorizer) for code number {code_number} :: {df_cosine_count}")

        #-----------------------------------------------------------------------------------------------
        tfidf_vectorizer = TfidfVectorizer()
        vector_matrix_tfidf = tfidf_vectorizer.fit_transform(data)
        cosine_similarity_matrix_tfidf = cosine_similarity(vector_matrix_tfidf)
        df_cosine_tfidf = create_dataframe(cosine_similarity_matrix_tfidf, ["doc_1", "doc_2"])
        print(f"Cosine Similarity (TfidfVectorizer) for code number {code_number} :: {df_cosine_tfidf}")

        # The off-diagonal cell of each 2x2 matrix is the doc_1-vs-doc_2 score.
        writer.writerow({
            'code number': code_number,
            'cosine similarity score (CountVectorizer)': df_cosine_count.loc["doc_1", "doc_2"],
            'cosine similarity score (TfidfVectorizer)': df_cosine_tfidf.loc["doc_1", "doc_2"],
        })

# NOTE(review): SOURCE lost its indentation; this final message is assumed to
# run once, after the report file has been closed -- confirm.
print("Calculate cosine similarity")