I am trying to send a request to OpenAI's speech-to-text API from my Vapor app. The API accepts multipart/form-data requests, but I couldn't figure out how to send one with Vapor's client API. For JSON requests, sending a request is fairly easy:
// JSON example: the `Content-Type` header is set explicitly and `reqData` is handed to `content:`.
let resp = try await client.post(
"https://api.openai.com/v1/chat/completions",
headers: HTTPHeaders([
("Content-Type", "application/json"),
("Authorization", "Bearer \(memoKey)")
]),
content: reqData
)
For multipart form data I've tried the following, but the API responds with a `Could not parse multipart form` error:
/// Form payload used by the transcription attempt: a model identifier plus the audio bytes.
struct SpeechToTextRequest: Content {
/// Model identifier; defaults to "whisper-1".
var model = "whisper-1"
/// Audio bytes sent as the `file` field.
var file: Data
}
/// First attempt: POSTs `audio` to the transcription endpoint as a form-data body and prints the response.
/// This is the version that the API rejects with "Could not parse multipart form".
func makeSpeechToTextRequest(
client: Client,
audio: Data
) async throws {
let result = try await client.post(
"https://api.openai.com/v1/audio/transcriptions",
headers: [
// NOTE(review): the header carries no `boundary` parameter, so the receiver has no
// delimiter to split the parts on — likely the cause of the parse error.
"Content-Type": "multipart/form-data",
"Authorization": "Bearer \(memoKey)"
],
beforeSend: { req in
let encoder = FormDataEncoder()
let encoded = try encoder.encode(
SpeechToTextRequest(file: audio),
// NOTE(review): an empty boundary is passed here and none is advertised in the header above.
boundary: ""
)
req.body = ByteBuffer(string: encoded)
}
)
print(result)
}
For reference, here is the equivalent curl command for the request:
# Reference request: a multipart form POST with a `file` part read from disk and a `model` field.
curl --request POST \
--url https://api.openai.com/v1/audio/transcriptions \
--header "Authorization: Bearer $OPENAI_API_KEY" \
--header 'Content-Type: multipart/form-data' \
--form file=@/path/to/file/openai.mp3 \
--form model=whisper-1
I ended up writing a small helper function that builds the multipart form data by hand, since I couldn't find any documentation for Vapor's MultipartKit library.
Here is how the request is constructed:
/// Transcribes `audio` through the `/v1/audio/transcriptions` endpoint.
///
/// The body is a hand-built multipart form carrying the audio as `speech.mp3` together with
/// the `model`, `response_format` and `timestamp_granularities[]` fields. The `Content-Type`
/// header is taken from the media type returned by `createMultipartFormData(from:)`, so it
/// carries the same boundary that was written into the body.
///
/// - Parameters:
///   - client: The client used to send the request.
///   - audio: The audio bytes to upload.
/// - Returns: The response body decoded as `SpeechToTextResponse`.
/// - Throws: Any error raised while sending the request or decoding the response.
func makeSpeechToTextRequest(
    client: Client,
    audio: Data
) async throws -> SpeechToTextResponse {
    // Parts are listed up front; they are serialized in this order when the request is built.
    let formFields: [MultipartField] = [
        .file(fileName: "speech.mp3", fileType: "audio/mp3", fileData: audio),
        .string(name: "model", value: "whisper-1"),
        .string(name: "response_format", value: "verbose_json"),
        .string(name: "timestamp_granularities[]", value: "word")
    ]

    let response = try await client.post(
        "https://api.openai.com/v1/audio/transcriptions",
        headers: ["Authorization": "Bearer \(memoKey)"],
        beforeSend: { request in
            let form = createMultipartFormData(from: formFields)
            request.body = form.0
            request.headers.contentType = form.1
        }
    )

    return try response.content.decode(SpeechToTextResponse.self)
}
And here is the helper function:
/// One part of a multipart/form-data body.
private enum MultipartField {
/// A plain text field with the given form `name` and `value`.
case string(name: String, value: String)
/// A file part: its filename, the part's content type, and the raw bytes.
case file(fileName: String, fileType: String, fileData: Data)
}
/// Serializes `fields` into a multipart/form-data body.
///
/// Every part is delimited by a freshly generated UUID boundary, and that same boundary is
/// returned inside the media type so the caller can set a matching `Content-Type` header.
/// File parts are always written under the form field name `file`.
///
/// Field names and filenames are percent-escaped (`"`, CR, LF) before being placed inside the
/// quoted `Content-Disposition` parameters, and CR/LF are dropped from a file's content type,
/// so caller-supplied text cannot end a header early or inject additional ones.
///
/// - Parameter fields: The parts to write, in order.
/// - Returns: The encoded body and the `multipart/form-data` media type carrying the boundary.
private func createMultipartFormData(from fields: [MultipartField]) -> (ByteBuffer, HTTPMediaType) {
    let boundary = UUID().uuidString
    var buffer = ByteBuffer()

    for field in fields {
        // Each part opens with the delimiter line.
        buffer.writeString("--\(boundary)\r\n")

        switch field {
        case let .file(fileName, fileType, fileData):
            let safeFileName = escapedDispositionParameter(fileName)
            let safeFileType = strippingLineBreaks(fileType)
            buffer.writeString("Content-Disposition: form-data; name=\"file\"; filename=\"\(safeFileName)\"\r\n")
            buffer.writeString("Content-Type: \(safeFileType)\r\n\r\n")
            buffer.writeData(fileData)
            buffer.writeString("\r\n")
        case let .string(name, value):
            let safeName = escapedDispositionParameter(name)
            buffer.writeString("Content-Disposition: form-data; name=\"\(safeName)\"\r\n\r\n")
            buffer.writeString("\(value)\r\n")
        }
    }

    // Closing delimiter: the boundary followed by two extra dashes.
    buffer.writeString("--\(boundary)--\r\n")

    let mediaType = HTTPMediaType(
        type: "multipart",
        subType: "form-data",
        parameters: ["boundary": boundary]
    )
    return (buffer, mediaType)
}

/// Escapes `raw` for use inside a quoted `Content-Disposition` parameter by replacing
/// `"`, CR and LF with `%22`, `%0D` and `%0A`.
///
/// Works on Unicode scalars because Swift treats "\r\n" as a single `Character`, which
/// would otherwise let a CRLF pair slip past a per-character check.
private func escapedDispositionParameter(_ raw: String) -> String {
    var escaped = String.UnicodeScalarView()
    for scalar in raw.unicodeScalars {
        switch scalar {
        case "\"": escaped.append(contentsOf: "%22".unicodeScalars)
        case "\r": escaped.append(contentsOf: "%0D".unicodeScalars)
        case "\n": escaped.append(contentsOf: "%0A".unicodeScalars)
        default: escaped.append(scalar)
        }
    }
    return String(escaped)
}

/// Removes CR and LF from `raw` so it can be written as a single header value.
private func strippingLineBreaks(_ raw: String) -> String {
    let kept = raw.unicodeScalars.filter { $0 != "\r" && $0 != "\n" }
    return String(String.UnicodeScalarView(kept))
}