- 논문 읽으며 모델 구조 디테일 이해
- 논문 코드 BERT/BART ⇒ KoBERT/KoBART 바꾸기
BoB-20230516T083051Z-010.zip
KcBERT 결과
- [x] read_mbti_split @김채현
- [x] read_nli_split @진시윤
- [x] preprocess data @진시윤
- [x] json파일 뽑기! (mbti/mbti_tokenized) @진시윤
- [x] T/F persona text input @모두
- [x] aihub split @진시윤
- [ ] model 구조
- [ ] 저는 도토리입니다.. 저는 감자요..
KoBERT 사용법
KoBART
import torch
import json
import torch.utils as utils
from torch.utils.data import Dataset
"""
<https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=543>
"""
#
class AIHubDataset(Dataset):
    """Serve one dialogue example (two speakers, a query, a response) per index.

    Every constructor argument except ``device`` is a mapping from a field
    name to an indexable collection with one entry per example (presumably
    tokenizer output such as ``input_ids`` / ``attention_mask`` -- confirm
    against the caller).  ``responses`` must contain an ``'input_ids'`` entry
    because ``__len__`` reads it.

    The class derives from ``torch.utils.data.Dataset`` so that it shares the
    same map-style base class as ``MBTIDataset`` and ``NLIDataset``.

    Args:
        speaker1: Per-field data describing the first speaker.
        speaker2: Per-field data describing the second speaker.
        queries: Per-field data for the query utterances.
        responses: Per-field data for the response utterances.
        device: Target passed to ``Tensor.to`` for every returned tensor.
    """

    def __init__(self, speaker1, speaker2, queries, responses, device):
        self.speaker1 = speaker1
        self.speaker2 = speaker2
        self.queries = queries
        self.responses = responses
        self.device = device

    def _select(self, fields, idx):
        """Return ``{name: tensor}`` for example ``idx``, moved to ``self.device``."""
        return {
            name: torch.tensor(values[idx]).to(self.device)
            for name, values in fields.items()
        }

    def __getitem__(self, idx):
        """Build the example at ``idx`` as a dict of per-field tensor dicts."""
        return {
            'persona1': self._select(self.speaker1, idx),
            # NOTE(review): 'person2' looks like a typo for 'persona2', but the
            # key is part of the returned structure, so it is kept unchanged
            # until every consumer of this dict has been checked.
            'person2': self._select(self.speaker2, idx),
            'query': self._select(self.queries, idx),
            'response': self._select(self.responses, idx),
        }

    def __len__(self):
        """Return the number of examples, taken from the response ``input_ids``."""
        return len(self.responses['input_ids'])
class MBTIDataset(Dataset):
    """Serve one question/answer pair together with both MBTI personas per index.

    Every constructor argument except ``device`` is a mapping from a field
    name to an indexable collection with one entry per example (presumably
    tokenizer output -- confirm against the caller).  ``answers`` must contain
    an ``'input_ids'`` entry because ``__len__`` reads it.

    Args:
        q_mbti: Per-field data for the questioner's MBTI persona.
        a_mbti: Per-field data for the answerer's MBTI persona.
        questions: Per-field data for the questions (stored as ``queries``).
        answers: Per-field data for the answers (stored as ``responses``).
        device: Target passed to ``Tensor.to`` for every returned tensor.
    """

    def __init__(self, q_mbti, a_mbti, questions, answers, device):
        self.q_mbti = q_mbti
        self.a_mbti = a_mbti
        self.queries = questions
        self.responses = answers
        self.device = device

    def _select(self, fields, idx):
        """Return ``{name: tensor}`` for example ``idx``, moved to ``self.device``."""
        return {
            name: torch.tensor(values[idx]).to(self.device)
            for name, values in fields.items()
        }

    def __getitem__(self, idx):
        """Build the example at ``idx`` as a dict of per-field tensor dicts."""
        # Alternative idea: feed the MBTI type in as a token and assign the
        # persona that way instead.
        return {
            'q_persona': self._select(self.q_mbti, idx),
            'a_persona': self._select(self.a_mbti, idx),
            'query': self._select(self.queries, idx),
            # The answers are stored as ``self.responses``; the earlier code
            # read a non-existent ``self.labels`` and raised AttributeError.
            'response': self._select(self.responses, idx),
        }

    def __len__(self):
        """Return the number of examples, taken from the answer ``input_ids``."""
        return len(self.responses['input_ids'])
class NLIDataset(Dataset):
    """Serve one ``pre`` / ``hyp`` pair per index.

    ``pre`` and ``hyp`` are mappings from a field name to an indexable
    collection with one entry per example (presumably tokenized premise and
    hypothesis sentences -- confirm against the caller).  ``pre`` must contain
    an ``'input_ids'`` entry because ``__len__`` reads it.

    Args:
        pre: Per-field data for the first element of each pair.
        hyp: Per-field data for the second element of each pair.
        device: Target passed to ``Tensor.to`` for every returned tensor.
    """

    def __init__(self, pre, hyp, device):
        self.pre, self.hyp = pre, hyp
        self.device = device

    def _select(self, fields, idx):
        """Return ``{name: tensor}`` for example ``idx``, moved to ``self.device``."""
        picked = {}
        for name, values in fields.items():
            picked[name] = torch.tensor(values[idx]).to(self.device)
        return picked

    def __getitem__(self, idx):
        """Build the example at ``idx`` as ``{'pre': {...}, 'hyp': {...}}``."""
        pre_tensors = self._select(self.pre, idx)
        hyp_tensors = self._select(self.hyp, idx)
        return {'pre': pre_tensors, 'hyp': hyp_tensors}

    def __len__(self):
        """Return the number of examples, taken from ``pre['input_ids']``."""
        input_ids = self.pre['input_ids']
        return len(input_ids)
Split