In the first post of this series, we walked through training a monolingual tokenizer on the target language and merging it with a pretrained LLM's tokenizer to form a multilingual tokenizer. In this post, you learn how to modify the embedding layer of the pretrained LLM to accommodate the merged tokenizer, and how to run continual pretraining in NVIDIA NeMo.

Setup
Start by importing the following libraries:
import torch
from omegaconf import OmegaConf
from transformers import AutoTokenizer

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
Model modification
This tutorial modifies the pretrained GPT-megatron-1.3B model so that it can use the multilingual tokenizer built in the first post. In other words, to adopt the merged tokenizer, you must modify the embedding layer of the GPT-megatron-1.3B model.

The model modification process consists of the following steps:
- Extract the embedding layer weights from the pretrained model.
- Create a new embedding layer sized to the merged tokenizer's vocabulary.
- Initialize the embeddings of the newly added tokens (to zero in this example).
Because the rows for the original vocabulary are carried over unchanged, the model keeps the knowledge acquired during pretraining while gaining the capacity to represent the new language's tokens, which continual pretraining then fills in.
Loading the pretrained model
Load the GPT-megatron-1.3B.nemo model with the following code:
# Initialization
trainer_config = OmegaConf.load('/opt/NeMo/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml')
trainer_config.trainer.accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
trainer = MegatronTrainerBuilder(trainer_config).create_trainer()

# Load gpt-megatron-1.3b.nemo and its config
nemo_model = MegatronGPTModel.restore_from('./path_to_1.3B_nemo_model', trainer=trainer)
nemo_config = OmegaConf.load('./path_to_1.3B_nemo_model_config.yaml')
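If you don't have the model's config YAML on disk, NeMo's restore_from can alternatively return the config packaged inside the .nemo file itself via its return_config flag; a minimal sketch:

# Alternative: read the config directly out of the .nemo archive
nemo_config = MegatronGPTModel.restore_from(
    './path_to_1.3B_nemo_model', trainer=trainer, return_config=True
)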
Extracting the embedding layer from the model state_dict
Next, extract the original embedding layer weights from the model's state_dict. They serve as the basis for the new embedding layer:
# Extract the original embedding layer
embed_weight = nemo_model.state_dict()['model.language_model.embedding.word_embeddings.weight']
print(f"Shape of original embedding layer: {embed_weight.shape}")
Generating the new embedding layer
Next, work out how many rows the embedding layer must gain to cover the merged tokenizer's vocabulary. The difference between the new (padded) vocabulary size and the original embedding size is the number of rows to append.
The calculation uses the following values:
- Vocabulary size of the merged tokenizer
- Vocabulary size of the original model
- The model.make_vocab_size_divisible_by setting in model_config.yaml
These map onto the code as follows:
- Vocabulary size of the merged tokenizer = len(tokenizer)
- Vocabulary size of the original NeMo model = embed_weight.shape[0]
- model.make_vocab_size_divisible_by = nemo_config.make_vocab_size_divisible_by
By default, padding rows are added so that the vocabulary size is a multiple of make_vocab_size_divisible_by (8 here), which keeps the embedding matrix shapes GPU-friendly. If the merged tokenizer's vocabulary size is already an exact multiple, no extra padding rows are needed.
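To make the rounding rule concrete, here is a minimal sketch of the padding calculation; the vocabulary size of 56,203 is a made-up example:

import math

def padded_vocab_size(vocab_size: int, divisible_by: int = 8) -> int:
    # Round the vocabulary size up to the next multiple of divisible_by
    return math.ceil(vocab_size / divisible_by) * divisible_by

print(padded_vocab_size(56203))  # 56208: five padding rows are appended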
Accordingly, the number of embedding rows to add is computed in two cases, depending on whether the merged vocabulary size divides evenly:
tokenizer = AutoTokenizer.from_pretrained('./path_to_new_merged_tokenizer')
if len(tokenizer) % nemo_config.make_vocab_size_divisible_by != 0:
    # Round the merged vocabulary up to the next multiple before comparing
    tokenizer_diff = (int(len(tokenizer) / nemo_config.make_vocab_size_divisible_by) + 1) * nemo_config.make_vocab_size_divisible_by - embed_weight.shape[0]
else:
    # Already an exact multiple: compare the merged vocabulary size directly
    tokenizer_diff = len(tokenizer) - embed_weight.shape[0]
The embeddings for the new tokens can be initialized in several ways. In this example, the new embeddings are initialized to zero and concatenated to the original embedding weights to form the new embedding layer:
hidden_size = embed_weight.shape[1]
# Initialize the embeddings for the newly added tokens to zero
random_embed = torch.zeros((tokenizer_diff, hidden_size)).to('cuda')
new_embed_weight = torch.cat((embed_weight, random_embed), dim=0)
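Zero initialization is just one choice. As a hypothetical alternative, you could seed the new rows with the mean of the existing embeddings plus small noise, keeping the new tokens close to the learned embedding distribution:

# Hypothetical alternative: mean-of-existing-embeddings initialization
mean_embed = embed_weight.mean(dim=0, keepdim=True)
noise = torch.randn(tokenizer_diff, hidden_size, device=embed_weight.device) * 0.02
random_embed = mean_embed + noise
new_embed_weight = torch.cat((embed_weight, random_embed), dim=0)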
Modifying and saving the new model
In this step, you create a new model that uses the modified embedding layer and the merged tokenizer, while all remaining weights stay unchanged so the pretrained knowledge is preserved. Specifically, copy the original model's state_dict and replace its embedding weights with the new embedding layer, then point the model config at the merged tokenizer files. Finally, instantiate a model from the updated config, load the modified state_dict, and save everything as a new .nemo file for continual pretraining.
# Copy the state_dict and swap in the new embedding weights
state_dict = nemo_model.state_dict()
state_dict['model.language_model.embedding.word_embeddings.weight'] = new_embed_weight

# Point the model config at the merged tokenizer files
NEW_TOKENIZER_PATH = './path_to_new_merged_tokenizer'
nemo_config['tokenizer']['vocab_file'] = f"{NEW_TOKENIZER_PATH}/vocab.json"
nemo_config['tokenizer']['merge_file'] = f"{NEW_TOKENIZER_PATH}/merges.txt"
nemo_config['vocab_file'] = f"{NEW_TOKENIZER_PATH}/vocab.json"
nemo_config['merges_file'] = f"{NEW_TOKENIZER_PATH}/merges.txt"

# Build a model from the updated config, load the modified weights, and save
new_nemo_model = MegatronGPTModel(nemo_config, trainer)
new_nemo_model.load_state_dict(state_dict)
new_nemo_model.save_to('./path_to_modified_nemo_model')
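Before running inference, it is worth confirming that the saved model actually picked up the resized embedding. A quick check, reusing the trainer and variables from above:

# Reload the saved model and verify the embedding covers the padded merged vocabulary
check_model = MegatronGPTModel.restore_from('./path_to_modified_nemo_model', trainer=trainer)
new_weight = check_model.state_dict()['model.language_model.embedding.word_embeddings.weight']
assert new_weight.shape[0] == embed_weight.shape[0] + tokenizer_diff
print(f"New embedding shape: {new_weight.shape}")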
Run the following command to perform a quick inference check on the new model:
python /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_eval.py \
gpt_model_file='./path_to_modified_nemo_model' \
prompts='ENTER YOUR PROMPT' \
inference.greedy=True \
inference.add_BOS=True \
trainer.devices=1 \
trainer.num_nodes=1 \
tensor_model_parallel_size=-1 \
pipeline_model_parallel_size=-1
Data preprocessing
Run the data preprocessing script on the training, validation, and test datasets to convert them into the indexed format used for pretraining; that is, run this step three times in total, once per split. The --json-keys argument specifies the key in each dataset record that holds the document text:
python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
--input='./path_to_train/val/test_dataset' \
--json-keys=text \
--tokenizer-library=megatron \
--vocab='./path_to_merged_tokenizer_vocab_file' \
--dataset-impl=mmap \
--tokenizer-type=GPT2BPETokenizer \
--merge-file='./path_to_merged_tokenizer_merge_file' \
--append-eod \
--output-prefix='./path_to_output_preprocessed_dataset'
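The script expects its input in JSON Lines format: one JSON object per line, with the document text stored under the key passed to --json-keys. A minimal sketch of writing such a file (the path and documents are placeholders):

import json

docs = ["First Thai document ...", "Second Thai document ..."]
with open('./path_to_train_dataset', 'w', encoding='utf-8') as f:
    for doc in docs:
        # The "text" key must match --json-keys=text above
        f.write(json.dumps({"text": doc}, ensure_ascii=False) + "\n")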
Continual pretraining
For continual pretraining, a few settings in the default training config must be changed to match the modified model. Use the default config file and apply the following changes: overwrite the model section with the original model's config, then reset the batch size, data prefix, and tokenizer paths:
ori_conf = OmegaConf.load('./path_to_original_GPT-1.3B_model/model_config.yaml')
conf = OmegaConf.load('/opt/NeMo/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml')

# Overwrite the default model section with the original model's config
for key in ori_conf.keys():
    conf['model'][key] = ori_conf[key]

# Set global_batch_size based on micro_batch_size
conf['model']["global_batch_size"] = conf['model']["micro_batch_size"] * conf.get('data_model_parallel_size', 1) * conf.get('gradient_accumulation_steps', 1)

# Reset data_prefix; the dataset path is supplied on the command line later
conf['model']['data']['data_prefix'] = ''

# Reset tokenizer config to the merged tokenizer
NEW_TOKENIZER_PATH = "./path_to_new_merged_tokenizer"
conf['model']['tokenizer']['vocab_file'] = f"{NEW_TOKENIZER_PATH}/vocab.json"
conf['model']['tokenizer']['merge_file'] = f"{NEW_TOKENIZER_PATH}/merges.txt"
conf['model']['vocab_file'] = f"{NEW_TOKENIZER_PATH}/vocab.json"
conf['model']['merges_file'] = f"{NEW_TOKENIZER_PATH}/merges.txt"

OmegaConf.save(config=conf, f='/opt/NeMo/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml')
Run the following command to start continual pretraining. Adjust the following parameters to match the modified model and your hardware:
- nproc_per_node: Number of GPUs per node.
- model.data.data_prefix: Paths to the training, validation, and test datasets; see the example format below.
- exp_manager.name: Experiment name; intermediate files and checkpoints are saved under ./nemo_experiments/<exp_manager.name>.
- trainer.devices: Number of GPUs per node.
- trainer.num_nodes: Number of nodes.
- trainer.val_check_interval: How often (in steps) to run validation during training.
- trainer.max_steps: Maximum number of training steps.
- model.tensor_model_parallel_size: Keep 1 for the 1.3B model; use a larger value for bigger models.
- model.pipeline_model_parallel_size: Keep 1 for the 1.3B model; use a larger value for bigger models.
- model.micro_batch_size: Adjust according to the GPU's vRAM.
- model.global_batch_size: Depends on the micro_batch_size value; the sketch below shows how the values relate.
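As a rough guide to how the batch-size knobs interact, the sketch below shows the usual Megatron-style relationship; the gradient-accumulation factor is an assumption (1 if unused), and the values match the single-GPU command that follows:

devices, num_nodes = 1, 1   # GPUs per node, number of nodes
tp, pp = 1, 1               # tensor / pipeline model parallel sizes
micro_batch_size = 1
grad_accumulation = 1       # assumed factor; 1 if unused

# Data-parallel replicas = total GPUs / GPUs per model replica
data_parallel_size = (devices * num_nodes) // (tp * pp)
global_batch_size = micro_batch_size * data_parallel_size * grad_accumulation
print(global_batch_size)  # 1, matching the command below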
DATA = '{train:[1.0,training_data_indexed/train_text_document], validation:[training_data_indexed/val_text_document], test:[training_data_indexed/test_text_document]}'
!torchrun --nproc_per_node=1 \
/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_continue_training.py \
"model.data.data_prefix={DATA}" \
name=megatron_gpt_ \
exp_manager.name=megatron_gpt_1 \
restore_from_path='./path_to_modified_nemo_model' \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.precision=16 \
trainer.val_check_interval=300 \
trainer.max_steps=1200 \
model.megatron_amp_O2=False \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.micro_batch_size=1 \
model.global_batch_size=1 \
++model.use_flash_attention=False \
++model.seq_len_interpolation_factor=null
Model inference
During training, intermediate files are generated and saved in the ./nemo_experiments folder. In that folder you can find the desired model checkpoint files (.ckpt) and the matching hparams.yaml file.
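A small helper for locating these files; the experiment directory name here assumes the exp_manager.name used above and is purely illustrative:

from pathlib import Path

exp_dir = Path('./nemo_experiments/megatron_gpt_1')
checkpoints = sorted(exp_dir.rglob('*.ckpt'))       # all checkpoint files
hparams_file = next(exp_dir.rglob('hparams.yaml'))  # matching hparams file
print(checkpoints[-1], hparams_file)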
Use the following command to run inference with the checkpoint and hparams files:
python /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_eval.py \
'checkpoint_dir="./path_to_checkpoint_folder"' \
'checkpoint_name="name of checkpoint file in .ckpt format"' \
'hparams_file="./path_to_hparams_file"' \
prompts='ENTER YOUR PROMPT' \
inference.greedy=True \
inference.add_BOS=True \
trainer.devices=1 \
trainer.num_nodes=1 \
tensor_model_parallel_size=-1 \
pipeline_model_parallel_size=-1
Table 1 compares the outputs generated by the original GPT-megatron-1.3B model and the GPT-megatron-1.3B-TH model after continual pretraining on the Thai dataset, for an English and a Thai prompt. Some repetitive output is truncated in this post.
| Prompt | Output of GPT-megatron-1.3B.nemo | Output of trained GPT-megatron-1.3B-TH.nemo |
| --- | --- | --- |
| The capital city of Thailand is | The capital city of Thailand is from the capital city of Bangkok. The capital city of Thailand is from the capital city of Bangkok.\n\nHistory\n\nThe capital city of Thailand was originally called Chiang Mai, which means "city of the sun". The capital city of Thailand was originally called Chiang Mai, which means "city of the sun". | The capital city of Thailand is by the Thai Government of Thailand\n\n[Thai text] (The capital city of Thailand is by the Thai Government of Thailand\n\nOther sources\n\nCities in Chiang Rai Province\nAncient cities in Thailand) |
| [Thai prompt] (Bangkok is a popular tourist destination. The most famous tourist attractions are) | [Thai text] (Bangkok is a popular tourist destination. The most famous tourist attraction is the most famous working place.) | [Thai text] (Bangkok is a popular tourist destination. The most famous tourist attractions are: "Wat Thammathipatai" which is the temple that King Chulalongkorn built.) |

Table 1. Example outputs of the original and Thai-adapted models
After continual pretraining, the new model acquires the ability to generate Thai text in response to both English and Thai prompts. Some quality issues remain, which is expected given the limited training data and the small model size; continuing training with more data and a larger base model should improve performance.
Conclusion
In this post, you learned how to adapt a pretrained LLM's tokenizer and embedding layer so that the model can learn a new language without discarding the knowledge it already has. The key steps are merging a tokenizer trained on the new language into the pretrained tokenizer, extending the embedding layer so the original token embeddings are preserved, and then running continual pretraining on data in the new language.
This workflow is especially useful for low-resource languages, where training a model from scratch is impractical. To raise output quality, expand the training corpus, train for more steps, or start from a larger base model.
To get started, download the NeMo framework or visit the open-source /NVIDIA/NeMo repository on GitHub. With your own language data, you can train and merge a tokenizer and follow this workflow to extend a base LLM to a new language.
To learn more about training and customizing models with the NeMo framework, see the NVIDIA NeMo page and the NVIDIA NeMo documentation, which can help you build LLMs tailored to your own use cases.