import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score
[nltk_data] Downloading package stopwords to /Users/prm/nltk_data... [nltk_data] Package stopwords is already up-to-date!
# Data source: https://www.kaggle.com/datasets/gargmanas/sentimental-analysis-for-tweets?select=sentiment_tweets3.csv
# Load the labelled tweets into a DataFrame.
# NOTE(review): the path below is absolute and machine-specific; adjust it to run elsewhere.
tweets_df = pd.read_csv('/Users/prm/Documents/Portfolio/NLP_sentiment_analysis_of_reviews/sentiment_tweets3.csv')
# Print column names, dtypes and non-null counts
tweets_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10314 entries, 0 to 10313 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Index 10314 non-null int64 1 message to examine 10314 non-null object 2 label (depression result) 10314 non-null int64 dtypes: int64(2), object(1) memory usage: 241.9+ KB
# Preview the first five rows of the raw data
tweets_df.head()
Index | message to examine | label (depression result) | |
---|---|---|---|
0 | 106 | just had a real good moment. i missssssssss hi... | 0 |
1 | 217 | is reading manga http://plurk.com/p/mzp1e | 0 |
2 | 220 | @comeagainjen http://twitpic.com/2y2lx - http:... | 0 |
3 | 288 | @lapcat Need to send 'em to my accountant tomo... | 0 |
4 | 540 | ADD ME ON MYSPACE!!! myspace.com/LookThunder | 0 |
# The 'Index' column carries no information for the classifier, so remove it
tweets_df = tweets_df.drop(columns=['Index'])
tweets_df
message to examine | label (depression result) | |
---|---|---|
0 | just had a real good moment. i missssssssss hi... | 0 |
1 | is reading manga http://plurk.com/p/mzp1e | 0 |
2 | @comeagainjen http://twitpic.com/2y2lx - http:... | 0 |
3 | @lapcat Need to send 'em to my accountant tomo... | 0 |
4 | ADD ME ON MYSPACE!!! myspace.com/LookThunder | 0 |
... | ... | ... |
10309 | No Depression by G Herbo is my mood from now o... | 1 |
10310 | What do you do when depression succumbs the br... | 1 |
10311 | Ketamine Nasal Spray Shows Promise Against Dep... | 1 |
10312 | dont mistake a bad day with depression! everyo... | 1 |
10313 | 0 | 1 |
10314 rows × 2 columns
# Relabel columns with less cumbersome names
tweets_df = tweets_df.rename(columns={'label (depression result)': 'label', 'message to examine': 'message'})
# Visual null check: isnull() yields a boolean frame, so a missing cell would show up
# as a contrasting mark in the heatmap. We see no null values.
sns.heatmap(tweets_df.isnull(), yticklabels = False, cbar = False, cmap="Blues")
<AxesSubplot:>
# Histogram of the numeric columns; at this point 'label' is the only one
# (0 = not depressed, 1 = depressed)
tweets_df.hist(bins = 5, figsize = (13,5))
array([[<AxesSubplot:title={'center':'label'}>]], dtype=object)
# Count plot of the class labels: 0 = not depressed, 1 = depressed
sns.countplot(data=tweets_df, x='label')
<AxesSubplot:xlabel='label', ylabel='count'>
# Record the character count of every tweet in a new 'length' column
tweets_df['length'] = tweets_df['message'].map(len)
tweets_df
message | label | length | |
---|---|---|---|
0 | just had a real good moment. i missssssssss hi... | 0 | 57 |
1 | is reading manga http://plurk.com/p/mzp1e | 0 | 42 |
2 | @comeagainjen http://twitpic.com/2y2lx - http:... | 0 | 84 |
3 | @lapcat Need to send 'em to my accountant tomo... | 0 | 135 |
4 | ADD ME ON MYSPACE!!! myspace.com/LookThunder | 0 | 45 |
... | ... | ... | ... |
10309 | No Depression by G Herbo is my mood from now o... | 1 | 101 |
10310 | What do you do when depression succumbs the br... | 1 | 97 |
10311 | Ketamine Nasal Spray Shows Promise Against Dep... | 1 | 96 |
10312 | dont mistake a bad day with depression! everyo... | 1 | 57 |
10313 | 0 | 1 | 1 |
10314 rows × 3 columns
# Summary statistics (count, mean, std, min, quartiles, max) on the length column
tweets_df['length'].describe()
count 10314.000000 mean 92.616153 std 64.670905 min 1.000000 25% 48.000000 50% 78.000000 75% 119.000000 max 874.000000 Name: length, dtype: float64
# Tweets with minimum number of characters
# The minimum is computed rather than hard-coded so the filter stays correct if the data changes
tweets_df[tweets_df['length'] == tweets_df['length'].min()]['message']
9606 2 10313 0 Name: message, dtype: object
# Tweets with maximum number of characters
# The maximum is computed rather than hard-coded so the filter stays correct if the data changes
tweets_df[tweets_df['length'] == tweets_df['length'].max()]['message']
9410 @doucefeline @WhatASaviour7 @alisawoodard6 @SW... Name: message, dtype: object
# Tweets with mean number of characters
# The mean is truncated to an integer (92 for this dataset) because lengths are whole numbers;
# computing it avoids a hard-coded value that silently goes stale if the data changes
tweets_df[tweets_df['length'] == int(tweets_df['length'].mean())]['message']
84 Just a simple girl..oatmeal makes me really ha... 115 @kvanduyne I am happy you and mandy had a safe... 344 yes! the littlest houseguest is going home tod... 751 @scarletmandy i good...just woke up from Tweet... 1186 HAPPY MOTHER DAY FOR ALL THE STRONG HARD WORKI... ... 9879 Exercise can prevent depression, no matter you... 9968 Sadness and Depression are serious cases, you ... 10116 #mHealth App Aims to Help Caregivers Identify ... 10279 More than 1 in 20 US children and teens have a... 10282 Lil Prozac, a soundcloud rapper with songs mad... Name: message, Length: 76, dtype: object
# Histogram of tweet lengths in characters.
# Open question: some tweets are longer than 280 characters (the maximum is 874) -
# is the dataset also capturing threads?
# NOTE(review): mis-encoded characters in the text may also inflate lengths - TODO confirm.
sns.histplot(data=tweets_df, x='length', bins = 200)
<AxesSubplot:xlabel='length', ylabel='Count'>
# Concatenate every tweet into one space-separated string
text = ' '.join(tweets_df['message'])
# Build the word cloud from the combined text
wordcloud = WordCloud().generate(text)
# Render the cloud as an image with the axes hidden
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# Split the tweets by label: 0 -> positive feeling, 1 -> negative feeling
positive = tweets_df.loc[tweets_df['label'].eq(0)]
negative = tweets_df.loc[tweets_df['label'].eq(1)]
positive
message | label | length | |
---|---|---|---|
0 | just had a real good moment. i missssssssss hi... | 0 | 57 |
1 | is reading manga http://plurk.com/p/mzp1e | 0 | 42 |
2 | @comeagainjen http://twitpic.com/2y2lx - http:... | 0 | 84 |
3 | @lapcat Need to send 'em to my accountant tomo... | 0 | 135 |
4 | ADD ME ON MYSPACE!!! myspace.com/LookThunder | 0 | 45 |
... | ... | ... | ... |
7995 | "Wow, What A Tight Fit" Lmao, Shutup. | 0 | 48 |
7996 | @theokk don't know what you could possibly mea... | 0 | 64 |
7997 | @shwood loved the cameo, made me chuckle! alon... | 0 | 98 |
7998 | lol. i just realized my room has a color theme... | 0 | 132 |
7999 | ReCoVeRiNg FrOm ThE lOnG wEeKeNd | 0 | 33 |
8000 rows × 3 columns
# Display the tweets labelled 1
negative
message | label | length | |
---|---|---|---|
8000 | The lack of this understanding is a small but ... | 1 | 215 |
8001 | i just told my parents about my depression and... | 1 | 204 |
8002 | depression is something i don't speak about ev... | 1 | 264 |
8003 | Made myself a tortilla filled with pb&j. My de... | 1 | 86 |
8004 | @WorldofOutlaws I am gonna need depression med... | 1 | 196 |
... | ... | ... | ... |
10309 | No Depression by G Herbo is my mood from now o... | 1 | 101 |
10310 | What do you do when depression succumbs the br... | 1 | 97 |
10311 | Ketamine Nasal Spray Shows Promise Against Dep... | 1 | 96 |
10312 | dont mistake a bad day with depression! everyo... | 1 | 57 |
10313 | 0 | 1 | 1 |
2314 rows × 3 columns
# Merge each subset's tweets into a single space-separated string
text_positive = ' '.join(positive['message'])
text_negative = ' '.join(negative['message'])
# Generate one word cloud per subset
wordcloud_positive = WordCloud().generate(text_positive)
wordcloud_negative = WordCloud().generate(text_negative)
# Plot the positive-feeling cloud first, then the negative-feeling one
for cloud in (wordcloud_positive, wordcloud_negative):
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
# Pipeline for cleaning messages
# The pipeline performs the following, in order: (1) remove web addresses,
# (2) replace punctuation with spaces, (3) lowercase and remove stopwords
def preprocess_text(df, column):
    """Clean the text in ``df[column]`` in place and return ``df``.

    Each value has URLs removed and punctuation replaced by spaces; it is then
    lowercased, split on whitespace, stripped of NLTK English stopwords and
    re-joined with single spaces.

    Args:
        df: DataFrame holding the text. It is modified in place.
        column: Name of the column of strings to clean.

    Returns:
        The same DataFrame object, with ``column`` overwritten by the cleaned text.
    """
    # Create a set of stopwords (a set gives constant-time membership tests)
    stop_words = set(stopwords.words('english'))
    # Match URLs. IGNORECASE is required because this runs before lowercasing,
    # so 'HTTP://...' or 'WWW....' would otherwise survive as stray tokens.
    url_pattern = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
    # Match any single ASCII punctuation character
    punctuation_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))

    def clean(text):
        # URLs must go first: once punctuation is stripped they no longer match
        text = url_pattern.sub('', text)
        # Replace with a space (not '') so words joined by punctuation stay separate
        text = punctuation_pattern.sub(' ', text)
        return ' '.join(word for word in text.lower().split() if word not in stop_words)

    # Single pass over the column instead of one pass per cleaning step
    df[column] = df[column].apply(clean)
    return df
# Clean the tweet text; the function returns the frame it was given, so rebind and display it
tweets_df = preprocess_text(tweets_df, 'message')
tweets_df
message | label | length | |
---|---|---|---|
0 | real good moment missssssssss much | 0 | 57 |
1 | reading manga httpplurkcompmzp1e | 0 | 42 |
2 | comeagainjen httptwitpiccom2y2lx httpwwwyoutub... | 0 | 84 |
3 | lapcat need send em accountant tomorrow oddly ... | 0 | 135 |
4 | add myspace myspacecomlookthunder | 0 | 45 |
... | ... | ... | ... |
10309 | depression g herbo mood im done stressing peop... | 1 | 101 |
10310 | depression succumbs brain makes feel like youl... | 1 | 97 |
10311 | ketamine nasal spray shows promise depression ... | 1 | 96 |
10312 | dont mistake bad day depression everyone em | 1 | 57 |
10313 | 0 | 1 | 1 |
10314 rows × 3 columns
# Build a bag-of-words (token count) representation of the cleaned tweets.
# Initialize the CountVectorizer
vectorizer = CountVectorizer()
# Fit the vectorizer to the text data
# NOTE(review): the vocabulary is learned from every row, including rows that are
# later held out for testing.
X = vectorizer.fit_transform(tweets_df['message'])
# Get the vocabulary of words
vocabulary = vectorizer.get_feature_names_out()
# Convert the sparse matrix to a dense matrix
# NOTE(review): the dense array has one column per vocabulary term, which is
# memory-hungry for a large vocabulary.
X_dense = X.toarray()
# Convert the dense matrix to a pandas DataFrame
df_vectorized = pd.DataFrame(X_dense, columns=vocabulary)
# Print the vectorized DataFrame
df_vectorized
00 | 007peter | 033654 | 040 | 05 | 05ccs | 06 | 0645 | 07958938647 | 09 | ... | ñƒñðºð | ñƒñˆð | ÿä | œë¼ê³ | šà | žã | žå | ˆç | ˆë | ˆìš | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10310 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10311 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10312 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10313 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10314 rows × 23601 columns
# Dimensions of the count matrix: (number of tweets, number of vocabulary terms)
df_vectorized.shape
(10314, 23601)
# Define y variable that we want to predict: the 'label' column (0 or 1)
y_variable = tweets_df['label']
y_variable
0 0 1 0 2 0 3 0 4 0 .. 10309 1 10310 1 10311 1 10312 1 10313 1 Name: label, Length: 10314, dtype: int64
# Create training and testing subsets from total data set
# (20% of rows are held out; random_state fixes the shuffle so the split is reproducible)
xdata_train, xdata_test, ydata_train, ydata_test = train_test_split(df_vectorized, y_variable, test_size=0.2, random_state=42)
# Define Multinomial Naive Bayes model with default settings
NB_classifier = MultinomialNB()
# Fit on the training rows only
NB_classifier.fit(xdata_train, ydata_train)
MultinomialNB()
# Deploy model: predict labels for the held-out rows
ydata_pred = NB_classifier.predict(xdata_test)
# Print classification report
print(classification_report(ydata_test, ydata_pred))
# Define confusion matrix (rows = true labels, columns = predicted labels)
cm = confusion_matrix(ydata_test, ydata_pred)
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
# fmt='d' prints the integer counts in full; the default annotation format
# abbreviates large counts (e.g. 1.6e+03), which hides the exact numbers
heatmap = sns.heatmap(cm, annot = True, fmt = 'd', ax = ax, xticklabels=['Not depressed', 'Depressed'], yticklabels=['Not depressed', 'Depressed'])
# Label the axes so it is clear which direction is truth and which is prediction
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
# Show the plot
plt.show()
print('Accuracy:', accuracy_score(ydata_test, ydata_pred))
precision recall f1-score support 0 0.99 0.97 0.98 1614 1 0.89 0.98 0.93 449 accuracy 0.97 2063 macro avg 0.94 0.97 0.96 2063 weighted avg 0.97 0.97 0.97 2063
Accuracy: 0.9694619486185168
# Define pipeline for tuning: raw text -> token counts -> Naive Bayes
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', MultinomialNB())
])
# Define parameter grid for grid search
parameters = {
    'vectorizer__max_features': [500, 1000, 2000],
    'model__alpha': [0.1, 1.0, 10.0]
}
# Define scorer for grid search (F1 score of label 1)
scorer = make_scorer(f1_score)
# Tune on training rows only, so the rows used for the final evaluation do not
# influence the choice of hyperparameters. test_size and random_state mirror the
# split of the vectorized data, so the same rows are held out here.
text_train, text_test, label_train, label_test = train_test_split(
    tweets_df['message'], tweets_df['label'], test_size=0.2, random_state=42)
# Perform grid search (cross-validated within the training rows)
grid_search = GridSearchCV(pipeline, parameters, scoring=scorer)
grid_search.fit(text_train, label_train)
# Print best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)
Best parameters: {'model__alpha': 10.0, 'vectorizer__max_features': 2000} Best score: 0.9616552191204564
# Rebuild the count matrix with the vocabulary capped at 2000 terms.
# Note that X, vocabulary and X_dense are reassigned here, replacing the
# full-vocabulary versions created earlier.
# Initialize the CountVectorizer
vectorizer2 = CountVectorizer(max_features = 2000)
# Fit the vectorizer to the text data
X = vectorizer2.fit_transform(tweets_df['message'])
# Get the vocabulary of words
vocabulary = vectorizer2.get_feature_names_out()
# Convert the sparse matrix to a dense matrix
X_dense = X.toarray()
# Convert the dense matrix to a pandas DataFrame
df_vectorized2 = pd.DataFrame(X_dense, columns=vocabulary)
# Print the vectorized DataFrame
df_vectorized2
10 | 100 | 1000 | 11 | 12 | 13 | 15 | 16 | 18 | 18002738255 | ... | youu | youve | yr | yrs | yum | yummy | yup | ๠| ë² | ðµ | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10310 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10311 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10312 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10313 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10314 rows × 2000 columns
# Re-split using the 2000-feature matrix so the tuned model actually trains on the
# capped vocabulary; the previous xdata_train was built from df_vectorized (the full
# vocabulary), which left df_vectorized2 unused. test_size and random_state are
# unchanged, so the same rows are held out as before.
xdata_train, xdata_test, ydata_train, ydata_test = train_test_split(df_vectorized2, y_variable, test_size=0.2, random_state=42)
# Naive Bayes with the stronger smoothing (alpha) selected by the grid search
NB_classifier2 = MultinomialNB(alpha = 10)
NB_classifier2.fit(xdata_train, ydata_train)
MultinomialNB(alpha=10)
# Deploy model: predict labels for the held-out rows with the second classifier
ydata_pred = NB_classifier2.predict(xdata_test)
# Print classification report
print(classification_report(ydata_test, ydata_pred))
# Define confusion matrix (rows = true labels, columns = predicted labels)
cm = confusion_matrix(ydata_test, ydata_pred)
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
# fmt='d' prints the integer counts in full; the default annotation format
# abbreviates large counts (e.g. 1.6e+03), which hides the exact numbers
heatmap = sns.heatmap(cm, annot = True, fmt = 'd', ax = ax, xticklabels=['Not depressed', 'Depressed'], yticklabels=['Not depressed', 'Depressed'])
# Label the axes so it is clear which direction is truth and which is prediction
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
# Show the plot
plt.show()
print('Accuracy:', accuracy_score(ydata_test, ydata_pred))
precision recall f1-score support 0 0.97 1.00 0.99 1614 1 1.00 0.90 0.95 449 accuracy 0.98 2063 macro avg 0.98 0.95 0.97 2063 weighted avg 0.98 0.98 0.98 2063
Accuracy: 0.9777023751817742
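We see a small accuracy increase with tuned hyperparameters (0.969 to 0.978). Precision for the depressed class rises from 0.89 to 1.00, but its recall falls from 0.98 to 0.90, so the model now makes more Type II errors (false negatives: depressed tweets classified as not depressed).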