Download our latest MNC Answers Application at Play Store. Download Now

Unstructured Data Classification Hands-On Solution | TCS Fresco Play




Unstructured-Classification Hands-On Solutions



Disclaimer: The primary purpose of providing this solution is to assist and support anyone who is unable to complete these courses due to a technical issue or a lack of expertise. The information and data on this website are provided solely for knowledge and educational purposes.

Make an effort to understand these solutions and apply them to your own Hands-On difficulties. (Simply copying and pasting these solutions is not advisable.)

The course id of Unstructured-Classification is 55943

Install --> Test --> Run --> Open Preview

Copy the URL and paste it into a new tab

Click on unstructured_test.ipynb


Step1:- 

import pandas as pd

import numpy as np

import csv


Step2:- 

#Data Loading

# Load imdb.csv from the current working directory into a DataFrame.
imdb=pd.read_csv("imdb.csv")

# Assign column names positionally; the file is expected to have three columns.
imdb.columns = ["index","text","label"]

# Show the first five rows as a quick sanity check.
print(imdb.head(5))


Step3:- 

# Shape of the loaded frame as a (rows, columns) tuple.
data_size = imdb.shape

print(data_size)

# Column labels as a plain Python list.
imdb_col_names = list(imdb.columns)

print(imdb_col_names)

# Summary statistics of the other columns, grouped by label value.
print(imdb.groupby('label').describe())

# Show the first three rows.
print(imdb.head(3))


Step4:- 

# Keep the label column on its own as the classification target.
imdb_target=imdb['label'] 

print(imdb_target)


Step5:- 

from nltk.tokenize import word_tokenize

import nltk

# Downloads every NLTK data package.
# NOTE(review): this walkthrough only appears to use the tokenizer, WordNet and
# stopwords resources, so a narrower download is presumably enough - confirm
# the exact resource names for the installed NLTK version before changing.
nltk.download('all')

def split_tokens(text):
    """Lower-case *text* and return the word tokens produced by word_tokenize."""
    lowered = text.lower()
    return word_tokenize(lowered)

# Tokenize every review. Only the 'text' column is needed, so apply the
# function to that Series directly instead of iterating whole rows through
# DataFrame.apply(axis=1) with a lambda.
imdb['tokenized_message'] = imdb['text'].apply(split_tokens)


Step 6:- 

from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(text):
    """Return the WordNet lemma of each token in *text*, preserving order.

    *text* is an iterable of word tokens; the result is a new list.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in text]

# Lemmatize each token list. Only the 'tokenized_message' column is read, so
# use Series.apply on that column rather than a row-wise DataFrame.apply.
imdb['lemmatized_message'] = imdb['tokenized_message'].apply(split_into_lemmas)

# Print the row labelled 55 before and after lemmatization for comparison.
print('Tokenized message:', imdb['tokenized_message'][55])
print('Lemmatized message:', imdb['lemmatized_message'][55])



Step 7:- 

from nltk.corpus import stopwords

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def stopword_removal(text):
    """Remove English stop words from a token list and join the remainder.

    Args:
        text: Iterable of word tokens (strings).

    Returns:
        One space-separated string containing the tokens that are not in
        NLTK's English stop-word list, in their original order.
    """
    # The stop-word set is cached: this function is applied once per row, and
    # rebuilding the set from the corpus on every call is wasted work.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text if word not in stop_words)

# Strip stop words from each lemmatized token list. Only that one column is
# read, so apply the function to the Series instead of iterating whole rows.
imdb['preprocessed_message'] = imdb['lemmatized_message'].apply(stopword_removal)

print('Preprocessed message:', imdb['preprocessed_message'])

# Copy the cleaned text and the labels into fresh Series; going through list()
# discards the original index, so both start again from 0.
Training_data = pd.Series(list(imdb['preprocessed_message']))
Training_label = pd.Series(list(imdb['label']))


Step 8:- 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Count-based features over unigrams and bigrams. min_df is passed as the
# fraction 1/len(Training_label) and max_df as the fraction 0.7.
tf_vectorizer = CountVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)   

# Fit the vocabulary on the preprocessed messages.
Total_Dictionary_TDM = tf_vectorizer.fit(Training_data)

# Transform the same messages with the fitted vectorizer.
message_data_TDM = Total_Dictionary_TDM.transform(Training_data)


Step 9:- 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# TF-IDF features with the same n-gram range and document-frequency bounds as
# the count vectorizer built in the previous step.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = (1/len(Training_label)),max_df = 0.7)

# Fit the vocabulary and IDF weights on the preprocessed messages.
Total_Dictionary_TFIDF = tfidf_vectorizer.fit(Training_data)

# Transform the same messages with the fitted vectorizer.
message_data_TFIDF = Total_Dictionary_TFIDF.transform(Training_data)


Step 10:- 

from sklearn.model_selection import train_test_split  # Splitting the data for training and testing

# Hold out 10% of the count-vectorized messages for testing. No random_state
# is passed here, so the split is not fixed between runs.
train_data,test_data, train_label, test_label = train_test_split(message_data_TDM,Training_label,test_size = 0.1)


Step 11:- 

# Seed used for the classifiers' random_state.
seed = 9

from sklearn.svm import SVC

train_data_shape = train_data.shape
test_data_shape = test_data.shape

print("The shape of train data", train_data_shape)
print("The shape of test data", test_data_shape)

# Linear-kernel SVM trained on the training split.
classifier = SVC(kernel="linear", C=0.025, random_state=seed)
classifier = classifier.fit(train_data, train_label)

# Evaluate on the held-out split. Calling fit() here again would only retrain
# the model and bind the estimator object to `score`, so the print below would
# show the estimator's repr instead of an accuracy value.
score = classifier.score(test_data, test_label)

print('SVM Classifier : ', score)

# Save the tokenized and lemmatized forms of the row labelled 55 as one tuple.
with open('output.txt', 'w') as file:
    file.write(str((imdb['tokenized_message'][55], imdb['lemmatized_message'][55])))


Step 12:-

from sklearn.linear_model import SGDClassifier

# Split the count-vectorized messages again, this time holding out 20%.
train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM,
    Training_label,
    test_size=0.2,
)

train_data_shape, test_data_shape = train_data.shape, test_data.shape

print(f"The shape of train data {train_data_shape}")
print(f"The shape of test data {test_data_shape}")

# Linear model trained with SGD using the modified Huber loss.
classifier = SGDClassifier(
    loss='modified_huber',
    shuffle=True,
    random_state=seed,
)
classifier = classifier.fit(train_data, train_label)

# Score the fitted model on the held-out split.
score = classifier.score(test_data, test_label)

print(f"SGD classifier :  {score}")

# Save the preprocessed text of the row labelled 55.
with open('output1.txt', 'w') as file:
    file.write(str(imdb['preprocessed_message'][55]))



If you have any queries, please feel free to ask in the comment section.

If you want MCQs and Hands-On solutions for any other courses, please feel free to ask in the comment section too.