Sentiment analysis of tweets regarding depression. Can you predict depression based on sentiment analysis of tweets?¶

Load packages¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score
[nltk_data] Downloading package stopwords to /Users/prm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

EDA¶

In [2]:
# Data source: https://www.kaggle.com/datasets/gargmanas/sentimental-analysis-for-tweets?select=sentiment_tweets3.csv
tweets_df = pd.read_csv('/Users/prm/Documents/Portfolio/NLP_sentiment_analysis_of_reviews/sentiment_tweets3.csv')
In [3]:
tweets_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Index                      10314 non-null  int64 
 1   message to examine         10314 non-null  object
 2   label (depression result)  10314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 241.9+ KB
In [4]:
tweets_df.head()
Out[4]:
Index message to examine label (depression result)
0 106 just had a real good moment. i missssssssss hi... 0
1 217 is reading manga http://plurk.com/p/mzp1e 0
2 220 @comeagainjen http://twitpic.com/2y2lx - http:... 0
3 288 @lapcat Need to send 'em to my accountant tomo... 0
4 540 ADD ME ON MYSPACE!!! myspace.com/LookThunder 0
In [5]:
# Drop index column, as it is irrelevant
tweets_df = tweets_df.drop(['Index'], axis = 1)
tweets_df
Out[5]:
message to examine label (depression result)
0 just had a real good moment. i missssssssss hi... 0
1 is reading manga http://plurk.com/p/mzp1e 0
2 @comeagainjen http://twitpic.com/2y2lx - http:... 0
3 @lapcat Need to send 'em to my accountant tomo... 0
4 ADD ME ON MYSPACE!!! myspace.com/LookThunder 0
... ... ...
10309 No Depression by G Herbo is my mood from now o... 1
10310 What do you do when depression succumbs the br... 1
10311 Ketamine Nasal Spray Shows Promise Against Dep... 1
10312 dont mistake a bad day with depression! everyo... 1
10313 0 1

10314 rows × 2 columns

In [6]:
# Relabel columns with less cumbersome names
tweets_df = tweets_df.rename(columns={'label (depression result)': 'label', 'message to examine': 'message'})
In [8]:
# We see no null values
sns.heatmap(tweets_df.isnull(), yticklabels = False, cbar = False, cmap="Blues")
Out[8]:
<AxesSubplot:>
In [9]:
# Histogram of different labels: 0 = not depressed, 1 = depressed
tweets_df.hist(bins = 5, figsize = (13,5))
Out[9]:
array([[<AxesSubplot:title={'center':'label'}>]], dtype=object)
In [15]:
# Histogram of different labels: 0 = not depressed, 1 = depressed
sns.countplot(data=tweets_df, x='label')
Out[15]:
<AxesSubplot:xlabel='label', ylabel='count'>
In [18]:
# Add a column with the length of each tweet
tweets_df['length'] = tweets_df['message'].apply(len)
tweets_df
Out[18]:
message label length
0 just had a real good moment. i missssssssss hi... 0 57
1 is reading manga http://plurk.com/p/mzp1e 0 42
2 @comeagainjen http://twitpic.com/2y2lx - http:... 0 84
3 @lapcat Need to send 'em to my accountant tomo... 0 135
4 ADD ME ON MYSPACE!!! myspace.com/LookThunder 0 45
... ... ... ...
10309 No Depression by G Herbo is my mood from now o... 1 101
10310 What do you do when depression succumbs the br... 1 97
10311 Ketamine Nasal Spray Shows Promise Against Dep... 1 96
10312 dont mistake a bad day with depression! everyo... 1 57
10313 0 1 1

10314 rows × 3 columns

In [19]:
# Summary statistics on the length column
tweets_df['length'].describe()
Out[19]:
count    10314.000000
mean        92.616153
std         64.670905
min          1.000000
25%         48.000000
50%         78.000000
75%        119.000000
max        874.000000
Name: length, dtype: float64
In [20]:
# Tweets with minimum number of characters
tweets_df[tweets_df['length'] == 1]['message']
Out[20]:
9606     2
10313    0
Name: message, dtype: object
In [21]:
# Tweets with maximum number of characters
tweets_df[tweets_df['length'] == 874]['message']
Out[21]:
9410    @doucefeline @WhatASaviour7 @alisawoodard6 @SW...
Name: message, dtype: object
In [22]:
# Tweets with mean number of characters
tweets_df[tweets_df['length'] == 92]['message']
Out[22]:
84       Just a simple girl..oatmeal makes me really ha...
115      @kvanduyne I am happy you and mandy had a safe...
344      yes! the littlest houseguest is going home tod...
751      @scarletmandy i good...just woke up from Tweet...
1186     HAPPY MOTHER DAY FOR ALL THE STRONG HARD WORKI...
                               ...                        
9879     Exercise can prevent depression, no matter you...
9968     Sadness and Depression are serious cases, you ...
10116    #mHealth App Aims to Help Caregivers Identify ...
10279    More than 1 in 20 US children and teens have a...
10282    Lil Prozac, a soundcloud rapper with songs mad...
Name: message, Length: 76, dtype: object
In [34]:
#Histogram of length of tweets. I wonder what values above 280 mean. Is the dataset also grabbing threads??
sns.histplot(data=tweets_df, x='length', bins = 200)
Out[34]:
<AxesSubplot:xlabel='length', ylabel='Count'>

Make wordcloud¶

In [35]:
text = ' '.join(tweets_df['message'].tolist())
# Generate the word cloud
wordcloud = WordCloud().generate(text)
In [36]:
# Plot the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [37]:
# Separate dataset into positive and negative feeling tweets
positive = tweets_df[tweets_df['label'] == 0]
negative = tweets_df[tweets_df['label'] == 1]
In [38]:
positive
Out[38]:
message label length
0 just had a real good moment. i missssssssss hi... 0 57
1 is reading manga http://plurk.com/p/mzp1e 0 42
2 @comeagainjen http://twitpic.com/2y2lx - http:... 0 84
3 @lapcat Need to send 'em to my accountant tomo... 0 135
4 ADD ME ON MYSPACE!!! myspace.com/LookThunder 0 45
... ... ... ...
7995 &quot;Wow, What A Tight Fit&quot; Lmao, Shutup. 0 48
7996 @theokk don't know what you could possibly mea... 0 64
7997 @shwood loved the cameo, made me chuckle! alon... 0 98
7998 lol. i just realized my room has a color theme... 0 132
7999 ReCoVeRiNg FrOm ThE lOnG wEeKeNd 0 33

8000 rows × 3 columns

In [39]:
negative
Out[39]:
message label length
8000 The lack of this understanding is a small but ... 1 215
8001 i just told my parents about my depression and... 1 204
8002 depression is something i don't speak about ev... 1 264
8003 Made myself a tortilla filled with pb&j. My de... 1 86
8004 @WorldofOutlaws I am gonna need depression med... 1 196
... ... ... ...
10309 No Depression by G Herbo is my mood from now o... 1 101
10310 What do you do when depression succumbs the br... 1 97
10311 Ketamine Nasal Spray Shows Promise Against Dep... 1 96
10312 dont mistake a bad day with depression! everyo... 1 57
10313 0 1 1

2314 rows × 3 columns

In [44]:
text_positive = ' '.join(positive['message'].tolist())
text_negative = ' '.join(negative['message'].tolist())
# Generate the word cloud
wordcloud_positive = WordCloud().generate(text_positive)
wordcloud_negative = WordCloud().generate(text_negative)
In [41]:
# Plot the word cloud for positive feeling tweets
plt.imshow(wordcloud_positive, interpolation='bilinear')
plt.axis("off")
plt.show()
In [43]:
# Plot the word cloud for negative feeling tweets
plt.imshow(wordcloud_negative, interpolation='bilinear')
plt.axis("off")
plt.show()

Data pre-processing: clean up text¶

In [52]:
# Pipeline for cleaning messages
# The pipeline performs the following: (1) remove punctuation, (2) remove stopwords, (3) remove web addresses

def preprocess_text(df, column):
    # Create a set of stopwords
    stop_words = set(stopwords.words('english'))
    
    # Define a pattern to match URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    
    # Define a pattern to match punctuation
    punctuation_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    
    # Apply preprocessing steps to each element in the specified column
    df[column] = df[column].apply(lambda x: url_pattern.sub('', x))  # Remove URLs
    df[column] = df[column].apply(lambda x: punctuation_pattern.sub(' ', x))  # Remove punctuation
    df[column] = df[column].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in stop_words]))  # Remove stopwords and convert to lowercase
    
    return df
In [53]:
# Apply pipeline
preprocess_text(tweets_df, 'message')
Out[53]:
message label length
0 real good moment missssssssss much 0 57
1 reading manga httpplurkcompmzp1e 0 42
2 comeagainjen httptwitpiccom2y2lx httpwwwyoutub... 0 84
3 lapcat need send em accountant tomorrow oddly ... 0 135
4 add myspace myspacecomlookthunder 0 45
... ... ... ...
10309 depression g herbo mood im done stressing peop... 1 101
10310 depression succumbs brain makes feel like youl... 1 97
10311 ketamine nasal spray shows promise depression ... 1 96
10312 dont mistake bad day depression everyone em 1 57
10313 0 1 1

10314 rows × 3 columns

Apply count vectorization¶

In [54]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()
# Fit the vectorizer to the text data
X = vectorizer.fit_transform(tweets_df['message'])
# Get the vocabulary of words
vocabulary = vectorizer.get_feature_names_out()
# Convert the sparse matrix to a dense matrix
X_dense = X.toarray()
# Convert the dense matrix to a pandas DataFrame
df_vectorized = pd.DataFrame(X_dense, columns=vocabulary)
# Print the vectorized DataFrame
df_vectorized
Out[54]:
00 007peter 033654 040 05 05ccs 06 0645 07958938647 09 ... ñƒñðºð ñƒñˆð ÿä œë¼ê³ šà žã žå ˆç ˆë ˆìš
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10309 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10310 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10311 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10312 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10313 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

10314 rows × 23601 columns

In [55]:
df_vectorized.shape
Out[55]:
(10314, 23601)

ML model to predict sentiment of tweets¶

In [56]:
# Define y variable that we want to predict
y_variable = tweets_df['label']
y_variable
Out[56]:
0        0
1        0
2        0
3        0
4        0
        ..
10309    1
10310    1
10311    1
10312    1
10313    1
Name: label, Length: 10314, dtype: int64
In [58]:
# Create training and testing subsets from total data set
xdata_train, xdata_test, ydata_train, ydata_test = train_test_split(df_vectorized, y_variable, test_size=0.2, random_state=42) 
In [60]:
# Define Multinomial Naive Bayes model
NB_classifier = MultinomialNB()
NB_classifier.fit(xdata_train, ydata_train)
Out[60]:
MultinomialNB()
In [61]:
# Deploy model
ydata_pred = NB_classifier.predict(xdata_test)
# Print classification report
print(classification_report(ydata_test, ydata_pred))
# Define confusion matrix
cm = confusion_matrix(ydata_test, ydata_pred)
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = sns.heatmap(cm, annot = True, xticklabels=['Not depressed', 'Depressed'], yticklabels=['Not depressed', 'Depressed'])
# Show the plot
plt.show()

print('Accuracy:', accuracy_score(ydata_test, ydata_pred))
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1614
           1       0.89      0.98      0.93       449

    accuracy                           0.97      2063
   macro avg       0.94      0.97      0.96      2063
weighted avg       0.97      0.97      0.97      2063

Accuracy: 0.9694619486185168

Hyperparameter tuning¶

In [62]:
# Define pipeline for tuning
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', MultinomialNB())
])

# Define parameter grid for grid search
parameters = {
    'vectorizer__max_features': [500, 1000, 2000],
    'model__alpha': [0.1, 1.0, 10.0]
}

# Define scorer for grid search
scorer = make_scorer(f1_score)

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, scoring=scorer)
grid_search.fit(tweets_df['message'], tweets_df['label'])

# Print best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)
Best parameters:  {'model__alpha': 10.0, 'vectorizer__max_features': 2000}
Best score:  0.9616552191204564

Count vectorization with tuned hyperparameters¶

In [66]:
# Initialize the CountVectorizer
vectorizer2 = CountVectorizer(max_features = 2000)
# Fit the vectorizer to the text data
X = vectorizer2.fit_transform(tweets_df['message'])
# Get the vocabulary of words
vocabulary = vectorizer2.get_feature_names_out()
# Convert the sparse matrix to a dense matrix
X_dense = X.toarray()
# Convert the dense matrix to a pandas DataFrame
df_vectorized2 = pd.DataFrame(X_dense, columns=vocabulary)
# Print the vectorized DataFrame
df_vectorized2
Out[66]:
10 100 1000 11 12 13 15 16 18 18002738255 ... youu youve yr yrs yum yummy yup à¹ ë² ðµ
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10309 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10310 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10311 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10312 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10313 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

10314 rows × 2000 columns

Modeling with tuned hyperparameters¶

In [67]:
NB_classifier2 = MultinomialNB(alpha = 10)
NB_classifier2.fit(xdata_train, ydata_train)
Out[67]:
MultinomialNB(alpha=10)
In [69]:
# Deploy model
ydata_pred = NB_classifier2.predict(xdata_test)
# Print classification report
print(classification_report(ydata_test, ydata_pred))
# Define confusion matrix
cm = confusion_matrix(ydata_test, ydata_pred)
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = sns.heatmap(cm, annot = True, xticklabels=['Not depressed', 'Depressed'], yticklabels=['Not depressed', 'Depressed'])
# Show the plot
plt.show()

print('Accuracy:', accuracy_score(ydata_test, ydata_pred))
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1614
           1       1.00      0.90      0.95       449

    accuracy                           0.98      2063
   macro avg       0.98      0.95      0.97      2063
weighted avg       0.98      0.98      0.98      2063

Accuracy: 0.9777023751817742

We see a small accuracy increase with tuned hyperparameters, now with an increased number of Type II errors.

In [ ]: