This is the second tutorial in the Tweet Sentiment Analysis with Logistic Regression series. Check out the first part if you haven't yet. In this tutorial, we will use another method to create features from sentences: frequency counts. This method creates only 3 features per tweet, rather than the 26,233 we got with one-hot encoding!
Frequency count: the number of times a word appears in the corpus of a particular class. Watch this video for a clear explanation. After that, watch this video for feature extraction using word frequencies.
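To make the idea concrete, here is a tiny, made-up two-tweet example (the sentences and variable names are just for illustration, not from the real dataset) that counts how often each word appears in the positive class and in the negative class:
from collections import Counter
# hypothetical mini-corpus: one positive tweet and one negative tweet
pos_words = "i am happy because i am learning".split()
neg_words = "i am sad because i am not learning".split()
# per-class word frequencies
pos_counts = Counter(pos_words)
neg_counts = Counter(neg_words)
print(pos_counts["happy"], pos_counts["i"])  # 1 2
print(neg_counts["sad"], neg_counts["i"])    # 1 2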
The data loading process is the same as in the previous tutorial.
# uncomment below line to install dependencies
# !pip install numpy pandas scikit-learn nltk
import re, nltk
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
from collections import Counter
# run the line below to download the dataset (needed only once)
nltk.download('twitter_samples')
# select the set of positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
print('Number of positive tweets: ', len(positive_tweets))
print('Number of negative tweets: ', len(negative_tweets))
print('\nThe type of positive_tweets is: ', type(positive_tweets))
print('The type of a tweet entry is: ', type(negative_tweets[0]))
# Let's look at an example tweet
print("Positive example ->", positive_tweets[0])
print()
print("Negative example ->", negative_tweets[0])
I am using a pandas DataFrame for easy data management.
posdf = pd.DataFrame(positive_tweets, columns=["tweet"])
posdf["target"] = 1
negdf = pd.DataFrame(negative_tweets, columns=["tweet"])
negdf["target"] = 0
# Combine both dataframes (ignore_index avoids duplicated row indices)
df = pd.concat([posdf, negdf], ignore_index=True)
df.shape
df.sample(6)
Do some cleaning, such as converting all text to lowercase and removing hashtags, extra spaces, etc. For the sake of simplicity, I am performing only simple cleaning here.
def preprocessing(tweet):
    # lowercase and strip surrounding whitespace
    tweet = tweet.lower().strip()
    # remove hashtag symbols
    tweet = re.sub(r'#', '', tweet)
    return tweet
df["tweet"] = df.tweet.apply(preprocessing)
df.sample(6)
Shuffle the dataframe and split the data into train and validation sets.
from sklearn.model_selection import train_test_split
traindf, valdf = train_test_split(df, shuffle=True)
print("Shape of train and val set:", traindf.shape, valdf.shape)
# Verify the classes of both splits
print("Samples distribution in train set:", dict(traindf.target.value_counts()))
print("Samples distribution in val set:", dict(valdf.target.value_counts()))
Now, let's build the word frequencies using the training set:
def build_freqs_dict(df):
    freqs = {}
    for i in range(len(df)):
        row = df.iloc[i]
        y = row.target
        for word in row.tweet.split(" "):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
            # The line above is equivalent to the if/else below, but more compact
            # if pair in freqs: freqs[pair] += 1
            # else: freqs[pair] = 1
    return freqs
%%time
freqs = build_freqs_dict(traindf)
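The result is a dictionary keyed by (word, class) pairs, where the class is 1 for positive and 0 for negative. Optionally, you can take a quick peek at it (the exact numbers depend on your random split):
print("Number of (word, class) pairs:", len(freqs))
print("Count of ('happy', 1):", freqs.get(("happy", 1), 0))
print("Count of ('happy', 0):", freqs.get(("happy", 0), 0))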
Now, let's make features from the frequencies. There will be three features for every tweet: a bias term (always 1), the sum of the positive-class frequencies of the tweet's words, and the sum of the negative-class frequencies of the tweet's words.
def make_features(tweet, freqs):
    # Initialize a zeros array of size 3
    feats = np.zeros(3, dtype=int)
    # set the bias term to 1
    feats[0] = 1
    for word in tweet.split(" "):
        # Add the word's positive-class frequency
        if (word, 1) in freqs: feats[1] += freqs[(word, 1)]
        # Add the word's negative-class frequency
        if (word, 0) in freqs: feats[2] += freqs[(word, 0)]
    assert feats.shape == (3,)
    return feats
# Test make_features function
sample = traindf.tweet.iloc[10]
sample_feats = make_features(sample, freqs)
print("Sample tweet:", sample)
print("Sample features:", sample_feats)
Now, make the features, then train and test the model.
from functools import partial
from sklearn.linear_model import LogisticRegression
X_train = traindf.tweet.apply(partial(make_features, freqs=freqs))
X_train = np.stack(X_train.values)
y_train = traindf.target.values
X_val = valdf.tweet.apply(partial(make_features, freqs=freqs))
X_val = np.stack(X_val.values)
y_val = valdf.target.values
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
%%time
clf = LogisticRegression()
clf.fit(X_train, y_train)
print("Train accuracy:", clf.score(X_train, y_train))
print("Validation accuracy:", clf.score(X_val, y_val))
These are acceptable results. However, we can improve our model by stemming and removing stop words and punctuation.
Let's do it...
We already lowercased the tweets and removed hashtags. Now, we will also remove old-style retweet text ("RT") and hyperlinks, tokenize with NLTK's TweetTokenizer instead of the plain .split method, remove stop words and punctuation, and stem the remaining words.
from nltk.corpus import stopwords # module for stop words that come with NLTK
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings
from string import punctuation # common punctuations
# run the line below to download the stopwords corpus (needed only once)
nltk.download('stopwords')
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

def preprocessing2(tweet, tokenizer):
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # tokenize
    tweet = tokenizer.tokenize(tweet)
    # remove stop words and punctuation, then stem each remaining word
    clean_tweet = [
        stemmer.stem(word) for word in tweet
        if (word not in stopwords_english) and (word not in punctuation)
    ]
    # Make a string from the tokens
    clean_tweet = " ".join(clean_tweet)
    return clean_tweet
# instantiate tokenizer class
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
%%time
df["tweet"] = df.tweet.apply(partial(preprocessing2, tokenizer=tokenizer))
df.sample(6)
Split data, build freqs, and train
# Split data
traindf, valdf = train_test_split(df, shuffle=True, test_size=.20)
print("Shape of train and val set:", traindf.shape, valdf.shape)
# Verify the classes of both splits
print("Samples distribution in train set:", dict(traindf.target.value_counts()))
print("Samples distribution in val set:", dict(valdf.target.value_counts()))
%%time
# Make freqs
freqs = build_freqs_dict(traindf)
%%time
# Make features
X_train = traindf.tweet.apply(partial(make_features, freqs=freqs))
X_train = np.stack(X_train.values)
y_train = traindf.target.values
X_val = valdf.tweet.apply(partial(make_features, freqs=freqs))
X_val = np.stack(X_val.values)
y_val = valdf.target.values
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
%%time
# Train and test
clf = LogisticRegression()
clf.fit(X_train, y_train)
print("Train accuracy:", clf.score(X_train, y_train))
print("Validation accuracy:", clf.score(X_val, y_val))
Wow! We gained a significant improvement with better preprocessing.
Now, let's test our model on our own text...
def test(sent):
    sent = preprocessing2(sent, tokenizer)
    test_feats = make_features(sent, freqs)
    y_pred = clf.predict(test_feats.reshape(1, -1))
    if y_pred[0] == 1: return "positive"
    elif y_pred[0] == 0: return "negative"
    else: return None
test("I am happy about the results")
test("This worked out fine.")
print(test("I lost my phone.")) # This should be negative
# Why is this positive?
print(test("lost")) # returns positive
# This is likely because "lost" appears in enough positive tweets in the training data
# that its positive frequency outweighs its negative frequency.
# This is a drawback of using raw word frequencies.
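# We can check this by looking the word up in the frequency dictionary
# (a quick diagnostic; the exact counts depend on your train/val split):
print("positive count for 'lost':", freqs.get(("lost", 1), 0))
print("negative count for 'lost':", freqs.get(("lost", 0), 0))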
test("Julia broke up with John") # What? that's not good
test("I forgot my lunch at home")
test("I'm sick")
test("this is a sick beat")
test("Get away from me")
So the frequency count method is significantly faster and more accurate (at least on the validation set) than one-hot encoding. However, further analysis is needed, and we might need more complex features for more difficult datasets.