transformer-model, openai-whisper

How can I run inference with a Whisper model fine-tuned via LoRA?


I fine-tuned a Whisper model with LoRA, but I ran into a problem.

The original model directory I trained is 2.7 GB, yet the directory produced by LoRA training is only 57 MB.

From this, I found that only the additional (adapter) weights were saved to the LoRA directory.
(That is, the original model's weights are not included.)
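
You can confirm this by loading the adapter config; it only stores a reference to the base model, not the base weights themselves (the directory path below is just a placeholder):

from peft import PeftConfig

# Placeholder path to the LoRA output directory.
config = PeftConfig.from_pretrained("./lora-out")

# The adapter config only references the base model; it contains no base weights.
print(config.base_model_name_or_path)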

So my question is: how can I combine the existing Whisper model with the LoRA-trained model for inference?

Below, I'm attaching my code for your convenience.

import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from peft import PeftModel, PeftConfig

class whisper:
    # model_str
    # 1. large - "openai/whisper-large-v3"
    # 2. medium - "openai/whisper-medium"
    # 3. small - "openai/whisper-small"
    def __init__(self, baseModelPath):
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        model = AutoModelForSpeechSeq2Seq.from_pretrained(baseModelPath, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
        model.to(device)

        processor = AutoProcessor.from_pretrained(baseModelPath)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            return_timestamps=True,
            torch_dtype=torch_dtype,
            device=device,
        )

        
    # korean
    def getText(self, audioPath, language='<|ko|>'):
        sentence = self.pipe(audioPath, generate_kwargs={"task":"transcribe", "language":language})
        return sentence['text']
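
For context, I call the class like this (the model name and audio path are just examples):

stt = whisper("openai/whisper-small")
print(stt.getText("./sample.wav"))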

There is some related information at that link, but I don't know whether it can be applied to the Whisper model.

Below are the files in the original directory.

-rw-r--r--  1 root root   34K Mar  4 17:49 added_tokens.json
-rw-r--r--  1 root root  1.4K Mar  5 09:48 config.json
-rw-r--r--  1 root root  3.0K Mar  5 09:48 generation_config.json
-rw-r--r--  1 root root  483K Mar  4 17:49 merges.txt
-rw-r--r--  1 root root  923M Mar  5 09:48 model.safetensors
-rw-r--r--  1 root root   52K Mar  4 17:49 normalizer.json
-rw-r--r--  1 root root  1.8G Mar  5 09:49 optimizer.pt
-rw-r--r--  1 root root   339 Mar  5 09:48 preprocessor_config.json
-rw-r--r--  1 root root   14K Mar  5 09:49 rng_state.pth
drwxr-xr-x  4 root root  4.0K Mar  4 17:49 runs
-rw-r--r--  1 root root  1.1K Mar  5 09:49 scheduler.pt
-rw-r--r--  1 root root  2.2K Mar  4 17:49 special_tokens_map.json
-rw-r--r--  1 root root  277K Mar  4 17:49 tokenizer_config.json
-rw-r--r--  1 root root   60K Mar  5 09:49 trainer_state.json
-rw-r--r--  1 root root  4.9K Mar  5 09:48 training_args.bin
-rw-r--r--  1 root root 1013K Mar  4 17:49 vocab.json

Below are the files in the LoRA output directory for that model.

drwxr-xr-x  3 root root  4.0K Mar 21 06:50 .
drwxr-xr-x 11 root root  4.0K Mar 21 13:16 ..
-rw-r--r--  1 root root  5.0K Mar 21 06:13 README.md
-rw-r--r--  1 root root   789 Mar 21 06:13 adapter_config.json
drwxr-xr-x  2 root root  4.0K Mar 21 06:13 adapter_model
-rw-r--r--  1 root root   14M Mar 21 06:13 adapter_model.safetensors
-rw-r--r--  1 root root   34K Mar 20 12:55 added_tokens.json
-rw-r--r--  1 root root  483K Mar 20 12:55 merges.txt
-rw-r--r--  1 root root   52K Mar 20 12:55 normalizer.json
-rw-r--r--  1 root root   28M Mar 21 06:13 optimizer.pt
-rw-r--r--  1 root root   339 Mar 21 06:13 preprocessor_config.json
-rw-r--r--  1 root root   14K Mar 21 06:13 rng_state.pth
-rw-r--r--  1 root root  1.1K Mar 21 06:13 scheduler.pt
-rw-r--r--  1 root root  2.2K Mar 20 12:55 special_tokens_map.json
-rw-r--r--  1 root root  277K Mar 20 12:55 tokenizer_config.json
-rw-r--r--  1 root root   31K Mar 21 06:13 trainer_state.json
-rw-r--r--  1 root root  4.9K Mar 21 06:13 training_args.bin
-rw-r--r--  1 root root 1013K Mar 20 12:55 vocab.json

Solution

  • The following code lets you run inference by combining the base Whisper model with the LoRA adapter: load the full base model first, then apply the adapter weights on top of it with PeftModel.from_pretrained (a usage sketch follows the class).

    from transformers import WhisperProcessor, WhisperForConditionalGeneration
    from peft import PeftModel
    
    class whisper:
        def __init__(self, baseModelName, loraModelPath='', cuda=True):
            # Load the processor and build forced decoder ids for Korean transcription.
            self.processor = WhisperProcessor.from_pretrained(baseModelName)
            self.forced_decoder_ids = self.processor.get_decoder_prompt_ids(language="korean", task="transcribe")
    
            # Load the full base model; the LoRA directory only holds the adapter.
            self.model = WhisperForConditionalGeneration.from_pretrained(baseModelName)
            if cuda:
                self.model = self.model.to("cuda")
    
            # Attach the LoRA adapter weights on top of the base model.
            if loraModelPath != '':
                self.model = PeftModel.from_pretrained(self.model, loraModelPath)
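
  • A usage sketch follows (the model name, adapter path, and audio are placeholders, and merging is optional). peft's merge_and_unload() folds the adapter deltas into the base weights and returns a plain WhisperForConditionalGeneration, so no PEFT wrapper is needed at inference time:

    import torch
    import numpy as np
    
    # Placeholder model name and adapter path.
    asr = whisper("openai/whisper-small", loraModelPath="./lora-out", cuda=False)
    
    # Optional: merge the LoRA deltas into the base weights so the result
    # behaves like an ordinary Whisper model.
    asr.model = asr.model.merge_and_unload()
    
    # One second of silence stands in for real 16 kHz audio.
    audio = np.zeros(16000, dtype=np.float32)
    features = asr.processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    
    with torch.no_grad():
        ids = asr.model.generate(features, forced_decoder_ids=asr.forced_decoder_ids)
    print(asr.processor.batch_decode(ids, skip_special_tokens=True)[0])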