Collectives™ on Stack Overflow
Find centralized, trusted content and collaborate around the technologies you use most.
Learn more about Collectives
Teams
Q&A for work
Connect and share knowledge within a single location that is structured and easy to search.
Learn more about Teams
Ask Question
I am trying to classify the email as spam and non spam. This is the
dataset
.
Following is the code
import pandas as pd
import re
import string
punct = string.punctuation
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP/Spam Email .csv')
remove_tags = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
def text_pre_processing(data):
corpus = []
for i in range(len(data['MESSAGE'])):
print(i)
token = data['MESSAGE'][i].replace('\n',' ')
token = token.replace('\t',' ')
token = token.replace('©',' ')
token = token.replace('/b',' ')
token = re.sub('(https?:\/\/)([\w]+.)*',' ',token) # to remove url
token = re.sub('(www.)([\w]+.)*',' ',token)
token = re.sub(remove_tags, ' ', token)#remove html tags
token = "".join([word for word in token if word not in punct])
#token = re.sub(' {2,}','',token)
token = re.sub('([\d])*','',token) #remove numbers
token = token.lower()
token = word_tokenize(token)
token = " ".join([wordnet_lemmatizer.lemmatize(word) for word in token if not word in set(stopWords)])
corpus.append(token)
return corpus
text_transformer = FunctionTransformer(text_pre_processing)
pipeline_rfc = Pipeline([
('pre-processing', text_transformer),
('tfidf', TfidfVectorizer()),
('Random Forest Classifier',RandomForestClassifier(n_estimators = 400,random_state = 0))
X = data.iloc[:,1:2]
y = data.iloc[:,0:1]
xtrain,xtest,ytrain,ytest = train_test_split(X,y, test_size=0.3, random_state=42)
pipeline_rfc.fit(xtrain,ytrain)
I am able to access data['MESSAGE'][8] outside the function text_pre_processing(data) but inside the function it shows KeyError:8.
This is the error I am getting.
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2897 try:
-> 2898 return self._engine.get_loc(casted_key)
2899 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
Can anyone suggest what I am doing wrong? Any help is appreciated.
–
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.