???? ?? ????? ???? ??? ??? ???? ???? AI ???? ??? ?? ? ????? ????. ?? ?? ??(LLM)? ??? ?? ??? ??? ??? AI? ??? ??? ???? ????, ??? ???, ??? ?? ??? ??? ? ??? ?????. ??? ???? ?? LLM? ?? ??? ??? ??? ???? ?? ????????? ?? ?? ? ??? ??? ?? ?? ???? ?????.
?? ??? LLM? ???? ?? ??? ???? ? ?? ???? ?? AI? ???? ??? ? ????.
?? ??? ??? LLM? ??? ???? ???? ??? ??? ?? ???? ?? ?????(SEA) ??? ???? ? ???? ?? ??? ????. ? ?? ??? ?? ???? ?? ??? ?? ??? ?????. ?? LLM? ?? SEA ??? ?? ?? ??? ? ??? ??? ???? ???, ??? ??? ???, ?? ??? ????.
??, SEA??? ???? ??? LLM? ???? ? ?? ??? ??? ??? ????. ? ?? ??? ?? ???, ????? ?? ???? ?? ?? ?? ????(NMLP)? ???? ?? 7,000? ???? ?? ??? ?????? ??????.
?? ?? ???? ???? 2?? ??????, ????? ??? ?? LLM? ???? ?? ??? ??? ??? ?? ? ??? ???? ???? ? ??? ??? ????. ??????? AI ???? ?? ??? ???? ?? ???? ??? LLM? ??? ????? ???? ?????.
??? ??? LLM? ?? ??? ???? ??? ???? ??? ??? ?? ????? ???? ?????. ??? ??? ??? ??? ???? ? ?? ???? ????, ????? ?????, ??? ??? ???? ??? ? ?? ?? ???? ???? ??? ? ??? ? ? ????.
NVIDIA NeMo? ???? ???? ??? AI? ??? ? ?? ?? ? ?? ??????. ???? ????? ??, ?? ?? ??(RAG), ???? ? ??, ??? ???? ??, ?? ????? ??? ???? ??? ??? AI? ?? ?? ????? ??? ??? ? ?? ???.
? ?????? NeMo? ???? ?? LLM? ??? ?? ??? ???? ?? ??? ?????. ? ??????? ????? ???? ? ??, ?? ???? ??, ?? ?? ?? ???? ?? ??? ?????.
? ?????? ??? ???? ???? ???? GPT-1.3B ??? ????? ?? ???????. 1???? ??? ????? ???? ? ??? ??? ??, 2???? NeMo ???? ??? ?????? ???? ?? ?? ????? ???? ?? ?? ?????.
??? ??? ??? ??? AI? ??? ???? ? ?? ? ?? ???? LLM? ??? ?? ? ????.
???? ??? LLM ???? ??
??? LLM? ??? ?? ? ??? ?? ??? ???? ?? ????? ????? LLM? ????? ????. ??? LLM? ???? ??? ??? ????.
- ??? ??? ??? ???? LLM? ???? ?? ???????.
- ?? ??? ??? ??? ???? ?? ?? ??? ?? ?? ?? ????? ?????.
???? ?? ??? ???? ??? ??? ? ?? ?????. ???? ?? ??? ??? ?? ?? ??? ???? ???? ???? ????. ?? ?? ????? ??? ?? ????? ???? ?? ???? ?? ????? ???? ??? ?? ?? ????? ??? ??? ??? ????? ???? ? ????.
???? ?? ?????? ?? ??? ?? ??? ????? ???? ?? ??? ? ?? ? ?? ?? ???? ?????.
?? ?? ????? ?? ???? ?? ???? ????? ? ? ???? ? ?? ??? ??? ?????? ???? ??? ????. ???? ????? ??? ??? ?? ???(BPE) ?????? ?????. ?? ?????? ???? ?? ??? ??? ??, ?? ??, ???? ??? ??? ?? ????.
???? ??? ?????? ??? ??? ???? ?? ??? ????? ???? ??? ??? ??? ???? ????. ??? ?? ?? ???? ?? ???? ?? ?? ???? ? ????? ???? ????? ? ??? ?? ??? ?????? ???? ???.
??? ??? ??? ?? NVIDIA? LLM? ?? ??? ?? ??? ???? ?? ?????? ?????.?

? ?????? ??? ???? ???? ?? ??? ?? ???? ?????.
- GPT ??? ?????? ???? ?? ???? ?? ?????? ????.
- ????? ????? ????? ???? ?? ?? ?????? ?????.
- ?? ?? ?????? ????? GPT ?? ????? ?????.
- ??? ???? ???? ??? ?? ?? ????? ?????.
? ?????? ????? ??? ?? ??? ??? ??? ? ????. 1??? 2??? ? ????? ????. 3??? 4??? 2?? ?????.
???? ?? ??
GPT-1.3B ??? ?? ?? ????? ?? ?? ???? ??? ?????.
- 30GB ??? ??? ?? ???? ??? NVIDIA GPU
- CUDA ? NVIDIA ????: CUDA 12.2(???? 535.154.05 ??)
- Ubuntu 22.04
- NVIDIA-container-toolkit ?? 1.14.6
- NeMo ????? ???? 24.01.01
NGC ????? ?? ???? ????, ?? ????? AI ??, ?????? ???? ?? ??? ??? ? ?? ??? SDK? ?? ?? ? ?? ?????? ????? GPU ?? ?????? ?? ???? ?????.
? ?? ???, NGC ?????? NeMo ????? ????? ?????? ???? ????? JupyterLab? ?????.
docker pull nvcr.io/nvidia/nemo:24.01.01.framework
docker run -it --gpus all -v <your working directory>:<your working directory in container> --workdir <your working directory> -p 8888:8888/ nvcr.io/nvidia/nemo:24.01.01.framework bash -c "jupyter lab"
??? ?? ? ??
? ??????? GitHub? NVIDIA NeMo ???? ?????? ???? ??? ????? ??? ???? ?????? ???????. NVIDIA NeMo ????? LLM ????? ?? NLP ???? ?????? ?? ?? ??? ??? ??? ?? ????? ?????. NLP ???? NeMo ???? ?? ??? ?? ?????? ?? ??? ? ????? ??? ???? ???? ???? ? ????.
???? ?????? ?? ?? ??? ????.
- ??? ???? ???? ?? ???? ??????.
- ??? ?? ???? ????? ?????.
- ?? ??? ?? ?? ?? ? ?? ?? ??? ???? ??? ??? ???? ?????.
- ?? ??? ???? ???? ???? ??? ?? ??? ?????.
?? ??? ?? ???? ????? NVIDIA NeMo ?????? ??? ?? ??? ???? ???? ??? ? ????.
?? ???? ? ??
? ?????? nemo-megatron-gpt-1.3B ??? ?????. ? ??? Pile ??? ??(?? ?? ?? ??? ??)?? ?????????. HuggingFace?? ?? ??? ????? ? ??? ???. ?? ?? ??? ???? ??? ????? ? ????.
!wget -P './model/nemo_gpt_megatron_1pt3b_fb16/' https://huggingface.co/nvidia/nemo-megatron-gpt-1.3B/resolve/main/nemo_gpt1.3B_fp16.nemo
????? ??? MD5 ???? ???? ???? ?????.
!md5sum ./model/nemo_gpt_megatron_1pt3b_fb16/nemo_gpt1.3B_fp16.nemo
??? ??? ??? ???.
38f7afe7af0551c9c5838dcea4224f8a  ./model/nemo_gpt_megatron_1pt3b_fb16/nemo_gpt1.3B_fp16.nemo
??? ????? ? ???? vocab.json ? merges.txt ??? ?????.
!tar -xvf ./model/nemo_gpt_megatron_1pt3b_fb16/nemo_gpt1.3B_fp16.nemo -C ./model/nemo_gpt_megatron_1pt3b_fb16/
? ??? ?? ??? ?????. ?? vocab.json ? merges.txt ??? ???? ? ???, ?? ??? ????? ??? ?????.
./
./50284f68eefe440e850c4fb42c4d13e7_merges.txt ./c4aec99015da48ba8cbcba41b48feb2c_vocab.json ./model_config.yaml ./model_weights.ckpt
????? ?????
?? ??? ??? ???? ? ?? ?????? ?????? ?? ?? ? ?? ?? ?? ? ??? ??? ? ????.
- ??? ??? ??: ??? ??? ??? ??? ??? ???? ???? ?????? ???????. ??? ??? ??? ??? ?? ??? ?? ? ??? ????.
- ?? ?? ??? ??: ?? ?? ?????? ????? ??, ?? ?? ?????? ?????. ??? ?? ??? ?? ?? ??? ???? ????? ??? ??? ???? ???? ? ??? ????. ????? ????? ???? ??? ?? ????.
? ??????? ?? ?? ??? ?? ?? ??? ???? ?? ????? GPT Megatron ??? ??? ???? ?????.
? ??? ??? ??? ??? ????.
- ????? ???? ??? ??: ?? ?? ????? ?? ??? ??? ??? ?? ??? ????. ? ??????? ???? ???? 30%? ????? ????? ?? ???? ??????.
- ??? GPT2 ????? ????: ?? ????? HuggingFace GPT2 ?????? ????? ???? ?? ??? ???? ???? TH GPT2 ?????? ???????.
- ? ????? ??: ? ????? ??? ?? merges.txt ? vocab.json? ???? ?????.
? ??????? ???? ?? ??? ?????.
??? ????? ????
???? ?? ?? ?????? ?????.
import json
import os
import random

from transformers import AutoTokenizer, GPT2Tokenizer
????? ??? ??
????? ?? ????? ????? .txt ??? ?? ??? ???? ?? convert_jsonl_to_txt? ?????. ? ??????? ‘text’? JSON ?? ???? ????? ?? ???? ??????. ??? ?? ?? ?????.
def convert_jsonl_to_txt(input_file, output_file, percentage, key='text'):
    """Append a random sample of one field from a JSONL file to a text file.

    Every non-blank line of ``input_file`` is kept with probability
    ``percentage``. For each kept line the value stored under ``key`` is
    stripped of surrounding whitespace and written to ``output_file`` as a
    single line.

    Args:
        input_file: Path of the JSONL file to read (decoded as UTF-8).
        output_file: Path of the text file to write (encoded as UTF-8). The
            file is opened in append mode, so repeated calls accumulate.
        percentage: Probability, between 0 and 1, that a record is kept.
        key: Name of the JSON field that holds the text.

    Raises:
        json.JSONDecodeError: If a sampled line is not valid JSON.
        KeyError: If a sampled record has no ``key`` field.
    """
    with open(input_file, 'r', encoding='utf-8') as in_file, \
            open(output_file, 'a', encoding='utf-8') as out_file:
        for line in in_file:
            # Blank lines (for example a trailing empty line) are not JSON
            # records; skip them instead of failing in json.loads.
            if not line.strip():
                continue
            if random.random() < percentage:
                data = json.loads(line)
                out_file.write(f"{data[key].strip()}\n")
?? ?? ??? ?? ????? ????? ???? ??? ? ????.
# Sample 30% of the records of every JSONL file in ./training_data into one
# corpus file. Files are visited in sorted order because os.listdir() returns
# them in arbitrary order.
# NOTE: convert_jsonl_to_txt appends to its output file, so delete
# training_corpus.txt before re-running this cell to avoid duplicated lines.
for file_name in sorted(os.listdir('./training_data')):
    if 'jsonl' not in file_name:
        continue
    input_file = os.path.join('./training_data', file_name)
    convert_jsonl_to_txt(input_file, 'training_corpus.txt', 0.3)

# Read the corpus back with the same encoding it was written with (UTF-8);
# relying on the locale default can corrupt or reject non-ASCII text.
with open('training_corpus.txt', 'r', encoding='utf-8') as corpus_file:
    training_corpus = corpus_file.readlines()
????? ???? ?? ?? ??? ??? ??? ? ?? ?? ??? ???? ???? ????? ???? ?????. ??? ??? ?? ??????? ? ????? ??????? ?????.
?? ?? ????? ????
?? ????? GPT2 ?????? ????? ??? ??, tokenizer.train_new_from_iterator ???? ???? ? ?????? ???????.
Vocab_size? tokenizer.train_new_from_iterator? ??? ?? ?????. ?? ???? ??? ??? ?? ?? ?????. ?? ??? ? ???? ???? ????? ?? ???? ???? ??, ?? ???? ??? ?? ?? ??? ??? ? ???? ? ???? ???? ?????.
# Load the pretrained GPT-2 tokenizer; it is the starting point for training.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Train a new tokenizer with an 8,000-entry vocabulary on the corpus lines.
new_tokenizer = old_tokenizer.train_new_from_iterator(
    text_iterator=training_corpus,
    vocab_size=8000,
)

# Write the trained tokenizer's files to ./new_monolingual_tokenizer/.
new_tokenizer.save_pretrained(save_directory='./new_monolingual_tokenizer/')
??? ?? ?? ????? ????? ?????? ?? ?? ???? ? ?????? ??? ?????. ?? ????? GPT2 ?????? TH ?????? ?? ???? ?? ??? ??? ?? ??? ??????.
- ??? ??: “เมืองหลวงของประเทศไทยคือกรุงเทพฯ”(“??? ??? ?????.”?? ?)
- ?? ??: ?“The capital of Thailand is Bangkok.”(“??? ??? ?????.”?? ?)
# Compare how the newly trained (TH) tokenizer and the pretrained GPT-2
# tokenizer split the same sentence in Thai and in English.
# tokenize() returns a list of token strings, so no return_tensors argument is
# passed (it only matters for calls that return encoded ids).

# Thai for "The capital of Thailand is Bangkok."
Thai_text = 'เมืองหลวงของประเทศไทยคือกรุงเทพฯ'
print(f"Sentence:{Thai_text}")
print("Output of TH tokenizer: ", new_tokenizer.tokenize(Thai_text))
print("Output of pretrained tokenizer: ", old_tokenizer.tokenize(Thai_text))

Eng_text = "The capital of Thailand is Bangkok."
print(f"Sentence:{Eng_text}")
print("Output of TH tokenizer: ", new_tokenizer.tokenize(Eng_text))
print("Output of pretrained tokenizer: ", old_tokenizer.tokenize(Eng_text))
?? ?? ??? ??? ?????.
Sentence:เมืองหลวงของประเทศไทยคือกรุงเทพฯ
Output of TH tokenizer: ['à¹Ģà¸¡', 'à¸·', 'à¸Ńà¸ĩà¸«à¸¥à¸§à¸ĩ', 'à¸Ĥà¸Ńà¸ĩà¸Ľà¸£à¸°à¹Ģà¸Ĺà¸¨à¹Ħà¸Ĺà¸¢', 'à¸Ħ', 'à¸·', 'à¸Ńà¸ģà¸£', 'à¸¸', 'à¸ĩà¹Ģà¸Ĺà¸ŀà¸¯']
Output of pretrained tokenizer: ['à¹', 'Ģ', 'à¸', '¡', 'à¸', '·', 'à¸', 'Ń', 'à¸', 'ĩ', 'à¸', '«', 'à¸', '¥', 'à¸', '§', 'à¸', 'ĩ', 'à¸', 'Ĥ', 'à¸', 'Ń', 'à¸', 'ĩ', 'à¸', 'Ľ', 'à¸', '£', 'à¸', '°', 'à¹', 'Ģ', 'à¸', 'Ĺ', 'à¸', '¨', 'à¹', 'Ħ', 'à¸', 'Ĺ', 'à¸', '¢', 'à¸', 'Ħ', 'à¸', '·', 'à¸', 'Ń', 'à¸', 'ģ', 'à¸', '£', 'à¸', '¸', 'à¸', 'ĩ', 'à¹', 'Ģ', 'à¸', 'Ĺ', 'à¸', 'ŀ', 'à¸', '¯']
Sentence:The capital of Thailand is Bangkok.
Output of TH tokenizer: ['The', 'Ġc', 'ap', 'ital', 'Ġof', 'ĠThailand', 'Ġis', 'ĠB', 'ang', 'k', 'ok', '.']
Output of pretrained tokenizer: ['The', 'Ġcapital', 'Ġof', 'ĠThailand', 'Ġis', 'ĠBangkok', '.']
???? TH ?????? ??? ??? ?? ?? ?????? ?? ?? ?? ?? ???? ???? ?? ??? ?? ? ??? ?? ? ? ????.
? ??? ?? ??? ??, ?? ??? ?? ??? ???? ??? ?? ??????? ??? ???? ?? ???? ?? ?????. ?????? ?? ?? ???? ????? UNK
???? ???? ?? ?? ?? ? ????.
????? ??
? ?????? ????? vocab.json
? merges.txt
??? ???? ???. ??? ???? ??? ??? ????.
vocab.json
??? ??:
- ?? ????? ??????
vocab.json
ID-?? ??? ?????. - ? ??? ???? ??? ?? ?? ??????
vocab.json
??? ?????. - ?? ?? ?? ID? ?? ??
vocab.json
??? ?????.
merges.txt
??? ??:
- ?? ????? ??????
merges.txt
??? ???? ?? ??? ?????. - ??? ?? ?? ??????
merges.txt
??? ?????.? - ??? ?? ??? ???? ?? ??
merges.txt
? ?????.?
??? ??? ? ?? ??? ???? vocab.json
?? ?? merges.txt
??? ???? ?? ? ??? ????.?
vocab.json
? ?? ?? ????? ??? ???? ?????? ?? ID-?? ??? ???? ???? ???. ?? ??? ?????? ?? ????? ??? ????‘dog’
??? ???? ????? ? ? ??? ??? ?? ?? ??? ?? ?? ‘dog’
??? ?? ID? ????? ‘cat’
? ?? ?? ??? ?? ????? ???? ??? ? ????.
merges.txt
? ?? merges.txt
? ?? ?? ??? ? ???? ???? ? BPE ?????? ???? ???? ? ?????. ?????? ??? ??? ? ?? ???? ???? ? ?? ??? ??? ?? ??? ???? ?? ????? ?????. ?? ??? ??? ???? ?????? ??? ??? ??? ?? ??? ???? ????? ?? ? ????.?
?? ?? ??? ????. ?? ??? ['N', 'VI', 'D', 'IA']
? ??? ?? ? ?? ?? ?? ??? ??? ??? ?????.
Set A:
N VI
D IA
NVI DIA
Set B:
D IA
NVI DIA
N VI
?? A? ?? ???? ??? ? ?????? ??? ??? ?? ??? ????.
['N', 'VI', 'D', 'IA'] -> ['NVI', 'D', 'IA'] (?? 1 ???.)
['NVI', 'D', 'IA'] -> ['NVI', 'DIA'] (?? 2 ???.)
['NVI', 'DIA'] -> ['NVIDIA'] (?? 3 ???.)
???? ?? ??? ??? ??? ['NVIDIA']
???.
??? ?? B? ??? ?? ???? ???? ?????? ??? ?????.
['N', 'VI', 'D', 'IA'] -> ['N', 'VI', 'DIA'] (?? 1 ???.)
['N', 'VI', 'DIA'] (?? ?? ?? ‘NVI’? ? ?? ??? ?? ? ???? ?? ??? ??? ? ??.)
? ?? ?? ?? 'N VI'
? ?'NVI DIA'
?? ????? ?????? ?'N'
? 'VI'
??? ?????. ????? ?????? ??? ['NVIDIA']
?? ??? ?? ?['N', 'VI', 'DIA']
? ?????.
??? ??? ???? ?????? ??? ???? ????? ??? ?????.
????? ??? ?? ?? ??? ?????.
output_dir = './path_to_merged_tokenizer'
# Make the directory if necessary
os.makedirs(output_dir, exist_ok=True)

# Read vocab files (token -> id mappings). UTF-8 is given explicitly because
# the merged vocabulary is written with ensure_ascii=False below.
with open(os.path.join('./path_to_pretrained_tokenizer', 'vocab.json'), encoding='utf-8') as fp:
    old_vocab = json.load(fp)
with open(os.path.join('./path_to_cusotmized_tokenizer', 'vocab.json'), encoding='utf-8') as fp:
    new_vocab = json.load(fp)

# New tokens are numbered after the largest existing id, so every token that
# is already in the pretrained vocabulary keeps its id.
next_id = max(old_vocab.values(), default=-1) + 1

# Add words from new tokenizer
for word in new_vocab:
    if word not in old_vocab:
        old_vocab[word] = next_id
        next_id += 1

# Save vocab
with open(os.path.join(output_dir, 'vocab.json'), 'w', encoding='utf-8') as fp:
    json.dump(old_vocab, fp, ensure_ascii=False)
old_merge_path = os.path.join('./path_to_pretrained_tokenizer', 'merges.txt')
new_merge_path = os.path.join('./path_to_cusotmized_tokenizer', 'merges.txt')

# Read merge files. Rules are stored without their line terminator so that a
# missing final newline in either file cannot glue two rules together or hide
# a duplicate from the set lookup below.
with open(old_merge_path, 'r', encoding='utf-8') as file:
    old_merge = [line.rstrip('\n') for line in file if line.strip()]
with open(new_merge_path, 'r', encoding='utf-8') as file:
    new_merge = [line.rstrip('\n') for line in file if line.strip()]

# The pretrained file is copied in full, including any "#version" header it
# has; drop the new file's own header line, but only if it really is one.
if new_merge and new_merge[0].startswith('#version'):
    new_merge = new_merge[1:]

# Add new merge rules, the order of merge rule has to be maintained:
# pretrained rules first, then the new rules in their original order.
old_merge_set = set(old_merge)
combined_merge = old_merge + [merge_rule for merge_rule in new_merge if merge_rule not in old_merge_set]

# Save merges.txt
with open(os.path.join(output_dir, 'merges.txt'), 'w', encoding='utf-8') as file:
    file.writelines(f"{merge_rule}\n" for merge_rule in combined_merge)
?? ??? ?????? ?? ? ????? ??? ??? ?? ????? ????? ? ??? ?? ?? ?????? ??? ? ????. ??? ?????? ?? ??? ?? ??? ????? ? ? ???? ?? ??? ? ????.
??
???? ??? ?? ??? ???? ? ?? BPE ?????? ????? ??? ??????.
?? ?????? ?? ????? ??? ??? ???? ???? ??? ?????? ???? NeMo?? ?? ?? ????? ?? ??? ?????? ?? ??? ??? ???? ?????.
??? ????? ????? ????? GitHub?? ?? ?? NeMo ????? ???? ? ???? ????? ?? ???? ?? ?? ??? ??? ???????. ?? NeMo ???? ??? ?? ??? ????? ???? NVIDIA NeMo ????? ?? ???? ???? ??? ???? ?????? ????? ???? ?? ????.
?? ???
- GTC ??: NVIDIA NeMo ? AWS? ?? LLM ???? ???
- GTC ??: NVIDIA NeMo? ?? ??? ??? ????? ?? ?? ?? ???
- GTC ??: NeMo, TensorRT-LLM, Triton ?? ??? ???? LLM ?? ?? ? ??
- SDK: NeMo ?? ???? ???
- SDK: NeMo LLM ???
- ???: ?? ??? ?? ??? AI