import json
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import nltk
from konlpy.tag import Okt
import numpy as np
import re
import pandas as pd
# Ensure you have the necessary NLTK data files
nltk.download('punkt')
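# Hedge, not part of the original script: newer NLTK releases (>= 3.9) ship the Punkt
# sentence data as a separate 'punkt_tab' resource; on older versions this download is simply unused.
nltk.download('punkt_tab')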
# Manually defined list of Korean stop words
korean_stop_words = set([
'은', '는', '이', '가', '을', '를', '에', '의', '와', '과', '도', '에서', '부터', '까지', '으로', '로', '보다', '와', '고', '도', '나',
'마저', '그리고', '그러나', '그래서', '그렇지만', '그런데', '따라서', '또는', '혹은', '저', '나', '너', '그', '그녀', '우리', '당신',
'여러분', '이것', '저것', '그것', '아', '오', '어', '야', '와', '허', '매우', '너무', '아주', '좀', '많이', '잘', '덜', '꼭', '대체로',
'역시', '특히', '다시', '마치', '또', '그냥', '즉', '거의',
])
def normalize_answer(s):
def remove_(text):
        ''' Remove unnecessary symbols and noise tokens '''
text = re.sub(r"'", " ", text)
text = re.sub(r'"', " ", text)
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'\(사진\)', ' ', text)
text = re.sub(r'△', ' ', text)
text = re.sub(r'▲', ' ', text)
text = re.sub(r'◇', ' ', text)
text = re.sub(r'■', ' ', text)
text = re.sub(r'ㆍ', ' ', text)
text = re.sub(r'↑', ' ', text)
text = re.sub(r'·', ' ', text)
text = re.sub(r'#', ' ', text)
text = re.sub(r'=', ' ', text)
text = re.sub(r'사례', ' ', text)
return text
def remove_parentheses_text(text):
        '''Remove parenthesized text that contains only whitespace and Hanja'''
def should_remove(match):
content = match.group(1).strip()
            # Drop the parentheses if the content is only whitespace and Hanja characters
            return re.fullmatch(r'[\s\u4E00-\u9FFF]*', content) is not None
        return re.sub(r'\(([^)]*)\)', lambda x: '' if should_remove(x) else x.group(0), text)
def white_space_fix(text):
        '''Collapse consecutive whitespace into a single space'''
return ' '.join(text.split())
    # Apply the preprocessing functions in order
return white_space_fix(remove_parentheses_text(remove_(s)))
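# Illustrative example of normalize_answer (hypothetical input string): quotes and the
# '(사진)' tag become spaces, the Hanja-only '(代表)' is dropped, and whitespace collapses:
#   normalize_answer('김 대표(代表)는 "성장"을 강조했다 (사진)')
#   -> '김 대표는 성장 을 강조했다'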
def jaccard_similarity(query, document):
    '''Jaccard similarity between two token lists (0.0 if both are empty).'''
    query_tokens = set(query)
    document_tokens = set(document)
    intersection = query_tokens.intersection(document_tokens)
    union = query_tokens.union(document_tokens)
    if not union:  # avoid division by zero when both token lists are empty
        return 0.0
    return len(intersection) / len(union)
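# Example: jaccard_similarity(['서울', '날씨'], ['날씨', '예보']) == 1/3,
# since the two token sets share one element out of three distinct tokens.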
# Reuse a single Okt tagger instance instead of constructing one on every call
okt = Okt()
def tokenize_korean(text, stop_words, method='pos'):
    '''Tokenize Korean text with Okt and drop stop words.'''
    if method == 'morphs':
        tokens = [word for word in okt.morphs(text) if word not in stop_words]
    elif method == 'pos':
        tokens = [word for word, pos in okt.pos(text) if word not in stop_words]
    elif method == 'nouns':
        tokens = [word for word in okt.nouns(text) if word not in stop_words]
    else:
        raise ValueError(f"Unknown tokenization method: {method}")
    return tokens
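# Example usage (exact tokens depend on the installed KoNLPy/Okt version):
#   tokenize_korean('오늘은 서울의 날씨가 맑다', korean_stop_words, method='nouns')
#   would typically yield ['오늘', '서울', '날씨'] with particles filtered out.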
def process_json(data):
# Check if a GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the SBERT model and move it to the appropriate device
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')
model = model.to(device)
output_data = []
# Process each entry in the input JSON
for entry in tqdm(data, desc="Processing entries"):
ID = entry['id']
context = entry['context']
question = entry['question']
answer = entry['answer']
        # Split the context into sentences with NLTK's Punkt tokenizer
        # (for Korean this mainly relies on sentence-final punctuation)
        sentences = nltk.sent_tokenize(context)
# Encode the question and sentences
question_embedding = model.encode(question, convert_to_tensor=True, device=device)
sentence_embeddings = model.encode(sentences, convert_to_tensor=True, device=device)
        # Compute cosine similarity between the question and each sentence
        # (squeeze(0) keeps a 1-D array even when the context has a single sentence)
        cosine_similarities = util.pytorch_cos_sim(question_embedding, sentence_embeddings).squeeze(0).cpu().numpy()
# Tokenize the question with stop words removal
question_tokens = tokenize_korean(question, korean_stop_words, method='pos')
# Compute Jaccard similarity between the question and each sentence
jaccard_similarities = []
for sentence in sentences:
sentence_tokens = tokenize_korean(sentence, korean_stop_words, method='pos')
similarity = jaccard_similarity(question_tokens, sentence_tokens)
jaccard_similarities.append(similarity)
jaccard_similarities = np.array(jaccard_similarities)
        # Combine the two scores; the Jaccard term gets a heuristic 4x weight
        total_similarities = cosine_similarities + 4 * jaccard_similarities
# Get the indices of the top 10 sentences based on total similarity
num_sentences = min(10, len(sentences))
top_indices = total_similarities.argsort()[-num_sentences:][::-1]
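        # e.g. with scores [0.2, 0.9, 0.5] and num_sentences=2, argsort()[-2:][::-1]
        # gives indices [1, 2]; sorted() below restores the original sentence order.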
# Filter context to keep only the selected top 10 sentences, ensuring no duplicates
filtered_context = ' '.join(dict.fromkeys([sentences[i] for i in sorted(top_indices)]))
output_data.append({
"id": ID,
"context": filtered_context,
"question": question,
"answer": answer,
})
return output_data
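# Minimal usage sketch for process_json (hypothetical sample entry, not real data):
#   sample = [{"id": "q-0", "context": "문장1. 문장2.", "question": "질문?", "answer": "정답"}]
#   filtered = process_json(sample)   # same schema back, with a shortened 'context'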
# Path to the input CSV file
csv_file = './dataset/train.csv'  # use test.csv for the test split
data = pd.read_csv(csv_file)
json_data = []
# Preprocess (normalize) each row
for _, row in tqdm(data.iterrows(), total=data.shape[0]):
ID = row['id']
    context = normalize_answer(row['context'])  # apply the normalization to the context column
question = normalize_answer(row['question'])
answer = row['answer']
json_data.append({
"id": ID,
"context": context,
"question": question,
"answer": answer,
})
# Similarity computation and sentence filtering
filtered_data = process_json(json_data)
# Save as JSON
json_string = json.dumps(filtered_data, ensure_ascii=False, indent=4)
with open('train_preprocessed4.json', 'w', encoding='utf-8') as file:
file.write(json_string)
print("Done!")
# Note: this script is for the train split only.
# To run it on the test split, comment out the four lines that reference "answer".
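# One sketch of the test-split variant described above (an assumption, not the original
# author's code): read 'answer' defensively instead of commenting lines out, e.g.
#   answer = row.get('answer', '')      # pandas Series supports .get()
# and likewise entry.get('answer', '') inside process_json.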