import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score

[nltk_data] Downloading package stopwords to /Users/prm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


import os

# Data source: https://www.kaggle.com/datasets/gargmanas/sentimental-analysis-for-tweets?select=sentiment_tweets3.csv
# The CSV location can be overridden with the TWEETS_CSV environment variable so the
# notebook also runs on machines other than the author's; the original absolute path
# remains the default, so existing behaviour is unchanged when the variable is unset.
DATA_PATH = os.environ.get(
    'TWEETS_CSV',
    '/Users/prm/Documents/Portfolio/NLP_sentiment_analysis_of_reviews/sentiment_tweets3.csv',
)
tweets_df = pd.read_csv(DATA_PATH)


# Column names, dtypes and non-null counts
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Index                      10314 non-null  int64 
 1   message to examine         10314 non-null  object
 2   label (depression result)  10314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 241.9+ KB


# Preview the first rows of the raw data
tweets_df.head()


# The 'Index' column carries no information for the analysis, so remove it
tweets_df = tweets_df.drop(columns=['Index'])
tweets_df


# Replace the verbose column names with short ones
tweets_df = tweets_df.rename(
    columns={
        'message to examine': 'message',
        'label (depression result)': 'label',
    }
)


# Visual check for missing values (a cell would be highlighted if it were null)
sns.heatmap(tweets_df.isnull(), cmap="Blues", cbar=False, yticklabels=False)

<AxesSubplot:>


# Histogram of the label column: 0 = not depressed, 1 = depressed
# DataFrame.hist() draws one panel per numeric column, so the column is named
# explicitly; otherwise re-running this cell after the 'length' column has been
# added would also plot the tweet lengths.
tweets_df.hist(column='label', bins = 5, figsize = (13,5))

array([[<AxesSubplot:title={'center':'label'}>]], dtype=object)


# Number of tweets per label: 0 = not depressed, 1 = depressed
sns.countplot(x='label', data=tweets_df)

<AxesSubplot:xlabel='label', ylabel='count'>


# Record each tweet's character count in a new 'length' column
tweets_df['length'] = tweets_df['message'].map(len)
tweets_df


# Descriptive statistics of the tweet lengths
tweets_df['length'].describe()

count    10314.000000
mean        92.616153
std         64.670905
min          1.000000
25%         48.000000
50%         78.000000
75%        119.000000
max        874.000000
Name: length, dtype: float64


# Tweets with the minimum number of characters
# (the minimum is computed from the data instead of being copied from describe(),
# so the cell stays correct if the dataset changes)
tweets_df[tweets_df['length'] == tweets_df['length'].min()]['message']

9606     2
10313    0
Name: message, dtype: object


# Tweets with the maximum number of characters
# (the maximum is computed from the data instead of being copied from describe(),
# so the cell stays correct if the dataset changes)
tweets_df[tweets_df['length'] == tweets_df['length'].max()]['message']

9410    @doucefeline @WhatASaviour7 @alisawoodard6 @SW...
Name: message, dtype: object


# Tweets whose length equals the mean number of characters
# The mean is generally not an integer, so it is truncated to a whole number of
# characters before comparing (92.6 -> 92 for this dataset).
tweets_df[tweets_df['length'] == int(tweets_df['length'].mean())]['message']

84       Just a simple girl..oatmeal makes me really ha...
115      @kvanduyne I am happy you and mandy had a safe...
344      yes! the littlest houseguest is going home tod...
751      @scarletmandy i good...just woke up from Tweet...
1186     HAPPY MOTHER DAY FOR ALL THE STRONG HARD WORKI...
                               ...                        
9879     Exercise can prevent depression, no matter you...
9968     Sadness and Depression are serious cases, you ...
10116    #mHealth App Aims to Help Caregivers Identify ...
10279    More than 1 in 20 US children and teens have a...
10282    Lil Prozac, a soundcloud rapper with songs mad...
Name: message, Length: 76, dtype: object


# Distribution of tweet lengths. Some values exceed 280 characters.
# NOTE(review): the longest tweet opens with a run of @mentions, so replies or
# threads may explain these values - unconfirmed, check against the data source.
sns.histplot(x='length', data=tweets_df, bins=200)

<AxesSubplot:xlabel='length', ylabel='Count'>


def _show_cloud(cloud):
    """Draw a generated word cloud as an image with the axes hidden."""
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


# Word cloud built from every tweet in the dataset
text = ' '.join(tweets_df['message'])
wordcloud = WordCloud().generate(text)


# Show the word cloud for the whole dataset
_show_cloud(wordcloud)


# Split the tweets by label: 0 (not depressed) -> positive, 1 (depressed) -> negative
positive = tweets_df[tweets_df['label'].eq(0)]
negative = tweets_df[tweets_df['label'].eq(1)]


positive


negative


# One word cloud per group
text_positive = ' '.join(positive['message'])
text_negative = ' '.join(negative['message'])
wordcloud_positive = WordCloud().generate(text_positive)
wordcloud_negative = WordCloud().generate(text_negative)


# Show the word cloud for positive feeling tweets
_show_cloud(wordcloud_positive)


# Show the word cloud for negative feeling tweets
_show_cloud(wordcloud_negative)


# Pipeline for cleaning messages
# The pipeline performs the following, in this order: (1) remove web addresses,
# (2) replace punctuation with spaces, (3) lowercase and remove stopwords

def preprocess_text(df, column):
    """Clean the text in ``df[column]`` and return ``df``.

    The column is overwritten on the DataFrame that is passed in, so callers
    that ignore the return value still see the cleaned text.

    Args:
        df: DataFrame holding the text column; modified in place.
        column: Name of the column to clean. Its values must be strings.

    Returns:
        The same DataFrame object, with ``column`` replaced by the cleaned text.
    """
    # Set membership keeps the per-word stopword test cheap
    stop_words = set(stopwords.words('english'))

    # URLs are stripped before the text is lowercased, so the match must be
    # case-insensitive or links such as "HTTP://..." / "WWW...." would survive
    url_pattern = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)

    # Character class matching any ASCII punctuation character
    punctuation_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))

    def clean(text):
        # URLs go first: once punctuation is gone they can no longer be recognised
        text = url_pattern.sub('', text)
        # Punctuation becomes a space so that joined words are split apart
        text = punctuation_pattern.sub(' ', text)
        return ' '.join(word for word in text.lower().split() if word not in stop_words)

    # A single pass over the column instead of one pass per cleaning step
    df[column] = df[column].apply(clean)

    return df


# Clean the tweets; the 'message' column of tweets_df is overwritten by the call,
# so the returned DataFrame does not need to be captured
preprocess_text(tweets_df, 'message')


# Bag-of-words encoding: learn the vocabulary, then count words per tweet
vectorizer = CountVectorizer()
vectorizer.fit(tweets_df['message'])
X = vectorizer.transform(tweets_df['message'])
# One feature name per learned vocabulary word
vocabulary = vectorizer.get_feature_names_out()
# Expand the sparse counts to a dense array so they can be held in a DataFrame
X_dense = X.toarray()
# One row per tweet, one column per vocabulary word
df_vectorized = pd.DataFrame(data=X_dense, columns=vocabulary)
# Show the vectorized DataFrame
df_vectorized


df_vectorized.shape

(10314, 23601)


# Prediction target: the 0/1 depression label of each tweet
y_variable = tweets_df.loc[:, 'label']
y_variable

0        0
1        0
2        0
3        0
4        0
        ..
10309    1
10310    1
10311    1
10312    1
10313    1
Name: label, Length: 10314, dtype: int64


# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
xdata_train, xdata_test, ydata_train, ydata_test = train_test_split(
    df_vectorized,
    y_variable,
    test_size=0.2,
    random_state=42,
)


# Fit a Multinomial Naive Bayes classifier on the training rows
NB_classifier = MultinomialNB()
NB_classifier.fit(xdata_train, ydata_train)

MultinomialNB()


# Predict the labels of the held-out tweets
ydata_pred = NB_classifier.predict(xdata_test)
# Print per-class precision / recall / F1
print(classification_report(ydata_test, ydata_pred))
# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(ydata_test, ydata_pred)
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
# fmt='d' prints the counts as integers; the default format would show values
# above 99 in scientific notation (e.g. 1.6e+03). The axes are passed
# explicitly so the heatmap is drawn on the figure created above.
heatmap = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    ax=ax,
    xticklabels=['Not depressed', 'Depressed'],
    yticklabels=['Not depressed', 'Depressed'],
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
# Show the plot
plt.show()

print('Accuracy:', accuracy_score(ydata_test, ydata_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1614
           1       0.89      0.98      0.93       449

    accuracy                           0.97      2063
   macro avg       0.94      0.97      0.96      2063
weighted avg       0.97      0.97      0.97      2063

Accuracy: 0.9694619486185168


# Pipeline to tune: raw text -> word counts -> Multinomial Naive Bayes
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('model', MultinomialNB()),
])

# Candidate values, addressed as <step name>__<parameter name>
parameters = dict(
    vectorizer__max_features=[500, 1000, 2000],
    model__alpha=[0.1, 1.0, 10.0],
)

# Rank the candidates by F1 score
scorer = make_scorer(f1_score)

# Cross-validated search over every combination in the grid
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring=scorer)
grid_search.fit(tweets_df['message'], tweets_df['label'])

# Report the winning combination and its cross-validated score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'model__alpha': 10.0, 'vectorizer__max_features': 2000}
Best score:  0.9616552191204564


# Initialize the CountVectorizer with the tuned vocabulary size
vectorizer2 = CountVectorizer(max_features = 2000)
# Fit the vectorizer to the text data
X = vectorizer2.fit_transform(tweets_df['message'])
# Get the vocabulary of words
vocabulary = vectorizer2.get_feature_names_out()
# Convert the sparse matrix to a dense matrix
X_dense = X.toarray()
# Convert the dense matrix to a pandas DataFrame
df_vectorized2 = pd.DataFrame(X_dense, columns=vocabulary)
# Print the vectorized DataFrame
df_vectorized2


# Re-create the train/test subsets from the tuned features. Without this the
# model below would be trained on the earlier split, which was built from the
# full-vocabulary df_vectorized, and max_features=2000 would have no effect.
# Same test_size and random_state as the first split, so the same rows land in
# the test set and the two models remain directly comparable.
xdata_train, xdata_test, ydata_train, ydata_test = train_test_split(df_vectorized2, y_variable, test_size=0.2, random_state=42)

# Multinomial Naive Bayes with the tuned smoothing parameter
NB_classifier2 = MultinomialNB(alpha = 10)
NB_classifier2.fit(xdata_train, ydata_train)

MultinomialNB(alpha=10)


# Predict the labels of the held-out tweets with the tuned model
ydata_pred = NB_classifier2.predict(xdata_test)
# Print per-class precision / recall / F1
print(classification_report(ydata_test, ydata_pred))
# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(ydata_test, ydata_pred)
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
# fmt='d' prints the counts as integers; the default format would show values
# above 99 in scientific notation (e.g. 1.6e+03). The axes are passed
# explicitly so the heatmap is drawn on the figure created above.
heatmap = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    ax=ax,
    xticklabels=['Not depressed', 'Depressed'],
    yticklabels=['Not depressed', 'Depressed'],
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
# Show the plot
plt.show()

print('Accuracy:', accuracy_score(ydata_test, ydata_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1614
           1       1.00      0.90      0.95       449

    accuracy                           0.98      2063
   macro avg       0.98      0.95      0.97      2063
weighted avg       0.98      0.98      0.98      2063

Accuracy: 0.9777023751817742

	Index	message to examine
0	106	just had a real good moment. i missssssssss hi...
1	217	is reading manga http://plurk.com/p/mzp1e
2	220	@comeagainjen http://twitpic.com/2y2lx - http:...
3	288	@lapcat Need to send 'em to my accountant tomo...
4	540	ADD ME ON MYSPACE!!! myspace.com/LookThunder

	message	label	length
0	just had a real good moment. i missssssssss hi...	0	57
1	is reading manga http://plurk.com/p/mzp1e	0	42
2	@comeagainjen http://twitpic.com/2y2lx - http:...	0	84
3	@lapcat Need to send 'em to my accountant tomo...	0	135
4	ADD ME ON MYSPACE!!! myspace.com/LookThunder	0	45
...	...	...	...
7995	"Wow, What A Tight Fit" Lmao, Shutup.	0	48
7996	@theokk don't know what you could possibly mea...	0	64
7997	@shwood loved the cameo, made me chuckle! alon...	0	98
7998	lol. i just realized my room has a color theme...	0	132
7999	ReCoVeRiNg FrOm ThE lOnG wEeKeNd	0	33

	message	label	length
8000	The lack of this understanding is a small but ...	1	215
8001	i just told my parents about my depression and...	1	204
8002	depression is something i don't speak about ev...	1	264
8003	Made myself a tortilla filled with pb&j. My de...	1	86
8004	@WorldofOutlaws I am gonna need depression med...	1	196
...	...	...	...
10309	No Depression by G Herbo is my mood from now o...	1	101
10310	What do you do when depression succumbs the br...	1	97
10311	Ketamine Nasal Spray Shows Promise Against Dep...	1	96
10312	dont mistake a bad day with depression! everyo...	1	57
10313	0	1	1

Sentiment analysis of tweets regarding depression. Can you predict depression based on sentiment analysis of tweets?

Load packages

EDA

Make wordcloud

Data pre-processing: clean up text

Apply count vectorization

ML model to predict sentiment of tweets

Hyperparameter tuning

Count vectorization with tuned hyperparameters

Modeling with tuned hyperparameters

	message	label	length
0	real good moment missssssssss much	0	57
1	reading manga httpplurkcompmzp1e	0	42
2	comeagainjen httptwitpiccom2y2lx httpwwwyoutub...	0	84
3	lapcat need send em accountant tomorrow oddly ...	0	135
4	add myspace myspacecomlookthunder	0	45
...	...	...	...
10309	depression g herbo mood im done stressing peop...	1	101
10310	depression succumbs brain makes feel like youl...	1	97
10311	ketamine nasal spray shows promise depression ...	1	96
10312	dont mistake bad day depression everyone em	1	57
10313	0	1	1