Unstructured-Classification Hands-On Solutions

The course ID for Unstructured-Classification is 55943.

Install --> Test --> Run --> Open Preview

Copy the URL and paste it into the next browser tab.

Click on unstructured_test.ipynb.

Step1:-

import pandas as pd

import numpy as np

import csv

Step2:-

# Give the three columns of imdb explicit names.
# NOTE(review): imdb is not created in any of the visible steps — presumably
# the notebook loads it before this point; confirm.
imdb.columns = ["index", "text", "label"]

Step3:-

# Shape of the dataset as reported by imdb.shape.
data_size = imdb.shape
print(data_size)

# Column labels as a plain Python list.
imdb_col_names = [*imdb.columns]
print(imdb_col_names)

# Descriptive statistics computed separately for each value of 'label'.
per_label = imdb.groupby('label')
print(per_label.describe())

Step4:-

# Select the 'label' column (the same column later used as the training
# label) and display it.
imdb_target = imdb['label']
print(imdb_target)

Step5:-

from nltk.tokenize import word_tokenize

import nltk

def split_tokens(text):
    """Lower-case *text* and split it into word tokens.

    Args:
        text: The string to tokenize; it must provide ``.lower()``.

    Returns:
        The tokens produced by NLTK's ``word_tokenize`` for the
        lower-cased text.
    """
    return word_tokenize(text.lower())

# Tokenize every 'text' entry. Series.apply hands each value straight to
# split_tokens, avoiding the per-row Series objects that
# DataFrame.apply(axis=1) has to build just to read one column.
imdb['tokenized_message'] = imdb['text'].apply(split_tokens)

Step 6:-

from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(text):
    """Lemmatize every token in *text* with NLTK's WordNet lemmatizer.

    Args:
        text: Iterable of word tokens; it is iterated item by item, so the
            caller passes the token list rather than a raw string.

    Returns:
        list: One lemmatized entry per input token, in the original order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

# Lemmatize each token list. Applying the function to the single column is
# equivalent to the row-wise DataFrame.apply(axis=1) form but skips building
# a Series for every row.
imdb['lemmatized_message'] = imdb['tokenized_message'].apply(split_into_lemmas)

# Show the entry labelled 55 before and after lemmatization.
print('Tokenized message:', imdb['tokenized_message'][55])
print('Lemmatized message:', imdb['lemmatized_message'][55])

Step 7:-

from nltk.corpus import stopwords

def stopword_removal(text):
    """Drop English stop words from a token sequence and re-join the rest.

    Args:
        text: Iterable of word tokens; it is iterated item by item, so the
            caller passes the lemmatized token list rather than a raw string.

    Returns:
        str: The tokens that are not in NLTK's English stop-word list,
        joined by single spaces in their original order.
    """
    # A set keeps the membership test in the filter below cheap.
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text if word not in stop_words)

# Remove stop words from each lemmatized token list and join the remainder
# into one string per row. Series.apply is used because only one column is
# needed; DataFrame.apply(axis=1) would build a Series for every row.
imdb['preprocessed_message'] = imdb['lemmatized_message'].apply(stopword_removal)
print('Preprocessed message:', imdb['preprocessed_message'])

# Rebuild both columns from plain lists so that the training data and the
# labels share a fresh 0..n-1 index.
Training_data = pd.Series(list(imdb['preprocessed_message']))
Training_label = pd.Series(list(imdb['label']))

Step 8:-

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Term counts over unigrams and bigrams. min_df is passed as a float
# (1 / number of documents) and max_df as 0.7; scikit-learn treats float
# values for these as document-frequency proportions.
tf_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=(1 / len(Training_label)),
    max_df=0.7,
)

# fit() learns the vocabulary; transform() then builds the term-document
# matrix for the same documents.
Total_Dictionary_TDM = tf_vectorizer.fit(Training_data)
message_data_TDM = Total_Dictionary_TDM.transform(Training_data)

Step 9:-

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# TF-IDF weights over unigrams and bigrams, using the same float min_df
# (1 / number of documents) and max_df (0.7) settings as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=(1 / len(Training_label)),
    max_df=0.7,
)

# fit() learns the vocabulary and IDF weights; transform() then builds the
# TF-IDF matrix for the same documents.
Total_Dictionary_TFIDF = tfidf_vectorizer.fit(Training_data)
message_data_TFIDF = Total_Dictionary_TFIDF.transform(Training_data)

Step 10:-

# Splitting the data for training and testing
from sklearn.model_selection import train_test_split

# Hold out 10% of the count-vectorized documents (and their labels) for testing.
train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM,
    Training_label,
    test_size=0.1,
)

Step 11:-

# Passed as random_state to the estimator below.
seed = 9

from sklearn.svm import SVC

train_data_shape = train_data.shape
test_data_shape = test_data.shape
print("The shape of train data", train_data_shape)
print("The shape of test data", test_data_shape )

# Linear-kernel support vector classifier trained on the training split.
classifier = SVC(kernel="linear", C=0.025, random_state=seed)
classifier = classifier.fit(train_data, train_label)

# score() evaluates the fitted model on the held-out split; calling fit()
# here would only return the estimator itself, not a number.
score = classifier.score(test_data, test_label)
print('SVM Classifier : ', score)

# Save the tokenized and lemmatized forms of the entry labelled 55 as the
# string form of a 2-tuple.
with open('output.txt', 'w') as file:
    file.write(str((imdb['tokenized_message'][55], imdb['lemmatized_message'][55])))

Step 12:-

from sklearn.linear_model import SGDClassifier

# Re-split the count-vectorized data, this time holding out 20% for testing.
train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, Training_label, test_size=0.2
)

train_data_shape = train_data.shape
test_data_shape = test_data.shape
print("The shape of train data", train_data_shape  )
print("The shape of test data", test_data_shape )

# Linear model trained with stochastic gradient descent using the
# 'modified_huber' loss; seed is the value assigned in the SVM step.
classifier = SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed)
classifier = classifier.fit(train_data, train_label)

# Evaluate the fitted model on the held-out split.
score = classifier.score(test_data, test_label)
print('SGD classifier : ', score)

# Save the preprocessed form of the entry labelled 55.
with open('output1.txt', 'w') as file:
    file.write(str((imdb['preprocessed_message'][55])))

