import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
# Load the bank reviews dataset and inspect it.
df = pd.read_excel("BankReviews.xlsx")
df.head()
# Date and BankName carry no signal for star prediction; drop them in place.
df.drop(columns=['Date', 'BankName'], inplace=True)
df.head(3)
df.shape
# Features are the raw review texts; the target is the star rating.
X = df.Reviews
y = df.Stars
X.head()
y.head()
# Persist features/target so preprocessing can restart without re-reading Excel.
import pickle
with open('Review.pickle','wb') as f:
    pickle.dump(X,f)
with open('sent.pickle','wb') as f:
    pickle.dump(y,f)
# Unpickling dataset.  Context managers guarantee the read handles are closed;
# the original opened Review.pickle/sent.pickle and never closed them.
with open('Review.pickle','rb') as f:
    X = pickle.load(f)
with open('sent.pickle','rb') as f:
    y = pickle.load(f)
# Creating the corpus: normalise each review into lower-cased word text.
import re
corpus = []
for i in range(0, len(X)):
    # .iloc is positional; the original X[i] only works while the Series
    # keeps its default RangeIndex and breaks if rows are ever filtered.
    review = re.sub(r'\W', ' ', str(X.iloc[i]))
    review = review.lower()
    # Strip stray 'br' tokens left over from HTML <br> tags.  The original
    # pattern r'^br$' anchored to the whole string, so it never matched a
    # real review and was a no-op.
    review = re.sub(r'\bbr\b', ' ', review)
    # Drop single-letter tokens produced by the punctuation removal above.
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^[a-z]\s+', '', review)
    # Collapse runs of whitespace into single spaces.
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)
X.iloc[0]
corpus[1]
from nltk.stem import WordNetLemmatizer

# Lemmatisation: collapse inflected forms so they share a single token.
lemmatizer = WordNetLemmatizer()
for idx, document in enumerate(corpus):
    tokens = nltk.word_tokenize(document)
    corpus[idx] = ' '.join(lemmatizer.lemmatize(tok) for tok in tokens)
# Creating the Tf-Idf model.  Terms must appear in at least 5% and at most
# 80% of the documents; the vocabulary is capped at 2000 features.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_features=2000,
    min_df=0.05,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)
X = vectorizer.fit_transform(corpus).toarray()
X
X.shape
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42)

logreg = LogisticRegression()
logreg.fit(train_x, train_y)
# Evaluation: compare test and train accuracy to gauge overfitting.
from sklearn import metrics

test_accuracy = metrics.accuracy_score(test_y, logreg.predict(test_x))
print('test_accuracy: ', test_accuracy)

train_accuracy = metrics.accuracy_score(train_y, logreg.predict(train_x))
print('train_accuracy: ', train_accuracy)
# Creating a confusion matrix for the two possible ratings (1 and 5 stars).
from sklearn import metrics
# `labels` is keyword-only in scikit-learn >= 1.0; the original positional
# call `confusion_matrix(test_y, pred, [1, 5])` raises TypeError there.
cm = metrics.confusion_matrix(test_y, logreg.predict(test_x), labels=[1, 5])
cm
import matplotlib.pyplot as plt
import seaborn as sn
# NOTE: `%matplotlib inline` is IPython magic and a syntax error in a plain
# .py script; it is only needed when running inside a notebook.
# Counts are integers, so format annotations with 'd' rather than '.2f'.
sn.heatmap(cm, annot=True, fmt='d',
           xticklabels=["1", "5"], yticklabels=["1", "5"])
plt.ylabel('True label')
plt.xlabel('Predicted label')
# As we can see from the confusion matrix, our model is not overfitted.
# Concatenate the model's prediction for every observation with the
# original dataset (both share the default RangeIndex, so rows align).
all_predictions = logreg.predict(X)
pred_stars = pd.DataFrame({'predicted_stars': all_predictions})
testfile = pd.concat([df, pred_stars], axis=1)
testfile.head(10)

# Export testfile to CSV for the final submission.
testfile.to_csv('Review_submission.csv', index=False)