Spoken Language Processing in Python
Daniel Bourke
Machine Learning Engineer/YouTube creator
# Inspect the folder containing the post-purchase audio
import os
# os.listdir returns the entry names found in "post_purchase" (names only, no directory prefix)
post_purchase_audio = os.listdir("post_purchase")
# Preview the first five entries
print(post_purchase_audio[:5])
['post-purchase-audio-0.mp3',
'post-purchase-audio-1.mp3',
'post-purchase-audio-2.mp3',
'post-purchase-audio-3.mp3',
'post-purchase-audio-4.mp3']
# Loop through the mp3 files
for file in post_purchase_audio:
    print(f"Converting {file} to .wav...")
    # Use the previously made function to convert to .wav
    # (convert_to_wav is defined elsewhere; it is called with the bare file name)
    convert_to_wav(file)
Converting post-purchase-audio-0.mp3 to .wav...
Converting post-purchase-audio-1.mp3 to .wav...
Converting post-purchase-audio-2.mp3 to .wav...
Converting post-purchase-audio-3.mp3 to .wav...
Converting post-purchase-audio-4.mp3 to .wav...
# Transcribe text from wav files
def create_text_list(folder):
    """Return the transcriptions of every ``.wav`` entry in *folder*.

    Args:
        folder: Iterable of file-name strings. Entries that do not end in
            ``.wav`` are skipped.

    Returns:
        A list with one ``transcribe_audio(file)`` result per ``.wav``
        entry, in the order the entries appear in *folder*.
    """
    text_list = []
    # Loop through the folder entries
    for file in folder:
        # Check for the .wav extension
        if file.endswith(".wav"):
            # Transcribe the audio (transcribe_audio is defined elsewhere)
            text = transcribe_audio(file)
            # Add the transcribed text to the list
            text_list.append(text)
    return text_list
# Convert the post-purchase audio to text
post_purchase_text = create_text_list(post_purchase_audio)
# Preview the first five transcriptions
print(post_purchase_text[:5])
['hey man I just water product from you guys and I think is amazing but I leave a little help setting it up',
'these clothes I just bought from you guys too small is there anyway I can change the size',
"I recently got these pair of shoes but they're too big can I change the size",
"I bought a pair of pants from you guys but they're way too small",
"I bought a pair of pants and they're the wrong colour is there any chance I can change that"]
import pandas as pd
# Create the post-purchase dataframe; the scalar label is repeated for every text row
post_purchase_df = pd.DataFrame({"label": "post_purchase", "text": post_purchase_text})
# Create the pre-purchase dataframe (pre_purchase_text is built elsewhere)
pre_purchase_df = pd.DataFrame({"label": "pre_purchase", "text": pre_purchase_text})
# Combine the pre-purchase and post-purchase dataframes
df = pd.concat([post_purchase_df, pre_purchase_df])
# View the combined dataframe
df.head()
label text
0 post_purchase yeah hello someone this morning delivered a pa...
1 post_purchase my shipment arrived yesterday but it's not the...
2 post_purchase hey my name is Daniel I received my shipment y...
3 post_purchase hey mate how are you doing I'm just calling in...
4 pre_purchase hey I was wondering if you know where my new p...
# Importeer tekstclassificatiepakketten
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
# Split the data into train and test sets.
# train_test_split takes the arrays positionally (*arrays); passing them as
# X=/y= keywords raises a TypeError.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.3)
# Build the text classification pipeline: token counts -> TF-IDF weights -> naive Bayes
text_classifier = Pipeline(
    [
        ("vectorizer", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ]
)

# Fit the pipeline on the training data
text_classifier.fit(X_train, y_train)
# Make predictions and compare them to the test labels
predictions = text_classifier.predict(X_test)
# y_test is already the label Series (selected via df["label"]), so compare
# against it directly; y_test.label would raise an AttributeError.
accuracy = 100 * np.mean(predictions == y_test)
print(f"The model is {accuracy:.2f}% accurate.")
The model is 97.87% accurate.
Spoken Language Processing in Python