Dealing with different kinds of audio

Spoken Language Processing in Python

Daniel Bourke

Machine Learning Engineer/YouTube Creator

What language?

# Instantiate the speech recognizer
recognizer = sr.Recognizer()

# Transcribe the Japanese clip while telling the API to expect US English
text = recognizer.recognize_google(
    japanese_good_morning,
    language="en-US",
)

# Show the transcription
print(text)

Ohio gozaimasu

What language?

# Instantiate the speech recognizer
recognizer = sr.Recognizer()

# Transcribe the Japanese clip, this time with the language set to Japanese
text = recognizer.recognize_google(
    japanese_good_morning,
    language="ja",
)

# Show the transcription
print(text)

おはようございます

Non-speech audio

# Open the leopard roar recording as an AudioFile
leopard_roar = sr.AudioFile("leopard_roar.wav")

# Read the AudioFile into an AudioData object
with leopard_roar as source:
    leopard_roar_audio = recognizer.record(source)

# Attempt to transcribe the roar (there is no speech in it)
recognizer.recognize_google(leopard_roar_audio)

UnknownValueError:

Non-speech audio

# Open the leopard roar recording as an AudioFile
leopard_roar = sr.AudioFile("leopard_roar.wav")

# Read the AudioFile into an AudioData object
with leopard_roar as source:
    leopard_roar_audio = recognizer.record(source)

# Attempt the transcription again, this time with show_all enabled
recognizer.recognize_google(
    leopard_roar_audio,
    show_all=True,
)

[]

Showing all

# Transcribe the Japanese clip as US English with show_all enabled
text = recognizer.recognize_google(
    japanese_good_morning,
    language="en-US",
    show_all=True,
)

# Show everything that came back
print(text)

{'alternative': [{'transcript': 'Ohio gozaimasu', 'confidence': 0.89041114},
  {'transcript': 'all hail gozaimasu'},
  {'transcript': 'ohayo gozaimasu'},
  {'transcript': 'olho gozaimasu'},
  {'transcript': 'all Hale gozaimasu'}],
 'final': True}

Multiple speakers

# Open a recording that contains several different speakers
multiple_speakers = sr.AudioFile("multiple-speakers.wav")

# Read the AudioFile into an AudioData object
with multiple_speakers as source:
    multiple_speakers_audio = recognizer.record(source)

# Transcribe the whole recording in a single call
recognizer.recognize_google(multiple_speakers_audio)

one of the limitations of the speech recognition library is that it doesn't
recognise different speakers and voices it will just return it all as one block 
of text

Multiple speakers

# Open one AudioFile per speaker, each from its own recording
speakers = [
    sr.AudioFile(filename)
    for filename in ("s0.wav", "s1.wav", "s2.wav")
]

# Record and transcribe the speakers one at a time
for i, speaker in enumerate(speakers):
    # Read this speaker's file into AudioData
    with speaker as source:
        speaker_audio = recognizer.record(source)
    # Print the transcription labelled with the speaker's index
    print(f"Text from speaker {i}: {recognizer.recognize_google(speaker_audio)}")

Text from speaker 0: one of the limitations of the speech recognition library
Text from speaker 1: is that it doesn't recognise different speakers and voices
Text from speaker 2: it will just return it all as one block a text

Noisy audio

If you have trouble hearing the speech, so will the APIs

# Import audio file with background noise
# (the file name must be a quoted string; unquoted it is read as an
# attribute lookup on a name that does not exist yet)
noisy_support_call = sr.AudioFile("noisy_support_call.wav")

with noisy_support_call as source:
    # Calibrate the recognizer for ambient noise using up to 0.5 seconds
    # of the clip, then record the audio
    recognizer.adjust_for_ambient_noise(source,
                                        duration=0.5)
    noisy_support_call_audio = recognizer.record(source)

# Recognize the audio
recognizer.recognize_google(noisy_support_call_audio)

hello ID like to get some help setting up my calories

Let's practice!

Spoken Language Processing in Python