Cluster Analysis in Python
Shaumik Daityari
Business Analyst
from nltk.tokenize import word_tokenize
import re

def remove_noise(text, stop_words = []):
    tokens = word_tokenize(text)
    cleaned_tokens = []
    for token in tokens:
        token = re.sub('[^A-Za-z0-9]+', '', token)
        if len(token) > 1 and token.lower() not in stop_words:
            # Get lowercase
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
remove_noise("It is lovely weather we are having. I hope the weather continues.")
['lovely', 'weather', 'hope', 'weather', 'continues']
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=50, min_df=0.2, tokenizer=remove_noise)
tfidf_matrix = tfidf_vectorizer.fit_transform(data)
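Here data is assumed to be a list of documents, one string per document. A minimal sketch of what that input could look like (the review texts below are hypothetical):

data = [
    "The room was clean and the hotel staff were friendly.",
    "Bad location, but the breakfast made up for it.",
    "Lovely hotel, the staff were helpful throughout our stay.",
]

tfidf_matrix = tfidf_vectorizer.fit_transform(data)
print(tfidf_matrix.shape)  # (number of documents, number of terms kept)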
kmeans() in SciPy does not support sparse matrices. Use .todense() to convert the TF-IDF matrix to a dense matrix before clustering.

from scipy.cluster.vq import kmeans

num_clusters = 2  # two clusters, matching the printed output below
cluster_centers, distortion = kmeans(tfidf_matrix.todense(), num_clusters)
terms = tfidf_vectorizer.get_feature_names_out()

for i in range(num_clusters):
    # Pair each term with its weight in this cluster center
    center_terms = dict(zip(terms, list(cluster_centers[i])))
    # Sort terms by weight, highest first, and print the top three
    sorted_terms = sorted(center_terms, key=center_terms.get, reverse=True)
    print(sorted_terms[:3])
['room', 'hotel', 'staff']
['bad', 'location', 'breakfast']
.todense() may not work with large datasets.
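If the corpus is too large to densify, one alternative to the SciPy-based code above is scikit-learn's KMeans, which accepts the sparse TF-IDF matrix directly. A minimal sketch, assuming tfidf_matrix and num_clusters from the earlier steps:

from sklearn.cluster import KMeans

# scikit-learn's KMeans works on the sparse matrix, so no .todense() call is needed
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
km.fit(tfidf_matrix)
cluster_centers = km.cluster_centers_  # dense array: one row of term weights per cluster

The top-terms loop above can then be reused with these cluster centers unchanged.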