The code: spam mail prediction
#libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
# Data collection and pre-processing
raw_mail_data = pd.read_csv("mail_data.csv")
# Keep every non-missing cell as-is and substitute an empty string for missing ones.
mail_data = raw_mail_data.where(raw_mail_data.notna(), "")

# Label encoding: spam mail -> 0, ham mail -> 1
for label, code in (("spam", 0), ("ham", 1)):
    mail_data.loc[mail_data["Category"] == label, "Category"] = code

# Separate the data set into texts (model input) and labels (model output)
X, Y = mail_data["Message"], mail_data["Category"]
# Feature extraction
# Split into training and test sets first: the original script used X_train/X_test/
# Y_train/Y_test without ever defining them. random_state pins the shuffle so that
# results are reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

# Transform the text data to TF-IDF feature vectors usable as logistic regression input.
# min_df=1         -> keep every term that occurs in at least one document
#                     (i.e. no term is dropped for being rare)
# stop_words       -> common English words are ignored
# lowercase=True   -> must be a real boolean; the string "True" is the wrong type and is
#                     rejected by scikit-learn versions that validate parameter types
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

# Learn the vocabulary/IDF weights from the training texts only, then reuse them for
# the test texts (fitting again on the test set would leak information).
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert Y_train and Y_test to integers (the Category column has dtype object)
Y_train = Y_train.astype("int")
Y_test = Y_test.astype("int")
# Logistic regression: build the classifier and fit it on the training features
# (fit() returns the estimator itself, so the call can be chained).
model = LogisticRegression().fit(X_train_features, Y_train)

# Evaluate the trained model on the training data
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train, y_pred=prediction_on_training_data
)
print("Accuracy on training data:", accuracy_on_training_data)

# Evaluate the trained model on the held-out test data
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test, y_pred=prediction_on_test_data
)
print("Accuracy on test data:", accuracy_on_test_data)

# Reading the two scores:
# - similar training and test accuracy suggests there is no overfitting;
# - training accuracy much higher than test accuracy suggests overfitting;
# - low accuracy on both suggests underfitting.
# NOTE: accuracy alone can look good on an imbalanced data set even when the
# minority class is rarely predicted.
# Building a predictive system: classify one message typed by the user.
# Label convention used throughout the script: HAM = 1, SPAM = 0.
inputs = input("please type a message.")
input_mail = [inputs]  # input() already returns str; the vectorizer expects an iterable of texts

# Convert the input string to numerical features using the vocabulary learned in training
input_data_features = feature_extraction.transform(input_mail)
print("input_data_features:", input_data_features)

# Words that are not in the training vocabulary (or are stop words) are ignored by
# transform(). If nothing is left, the feature vector is all zeros and the prediction
# is decided by the model's intercept alone, regardless of what was typed.
if input_data_features.nnz == 0:
    print("warning: no word of the message is in the training vocabulary; "
          "the prediction below does not depend on the message content")

# Making the prediction
prediction = model.predict(input_data_features)
print("prediction:", prediction)

# The bodies of the branches must be indented; without indentation the script
# does not even parse.
if prediction[0] == 1:
    print("Normal mail", prediction[0])
elif prediction[0] == 0:
    print("spam mail", prediction[0])
else:
    print("unknown condition")
Even though I enter spam-like content as input (discount, offer, etc.), I never get a result of 0 (i.e. spam mail). The model does not predict correctly; it always returns 1. What is the reason for this? The accuracy scores look normal (
