Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions BT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
'''
def overlapping(question):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
#open file and selecting sheet
file='overlapping.xlsx'
sheet=pd.ExcelFile(file)
df_train=sheet.parse("Sheet2")

def training(df_train):

label_encoding=preprocessing.LabelEncoder()
df_train['CategoryLabel']=label_encoding.fit_transform(df_train['Category'])
question_train=df_train['Question']
category_train=df_train['CategoryLabel']
vectorizer = CountVectorizer()
#training set
X_train = vectorizer.fit_transform(question_train).toarray()
y_train=list(category_train)
#linear SVC
clf=LinearSVC()
clf.fit(X_train,y_train)
return vectorizer ,clf,label_encoding

def testing (question,vectorizer,clf,label_encoding):
X_test= vectorizer.transform(question)
y_test=clf.predict(X_test)
category=label_encoding.inverse_transform(y_test)
question.append(category[0])
return(question)

vectorizer ,clf,label_encoding=training(df_train)
output=testing(question,vectorizer,clf,label_encoding)
return output

op=overlapping(["explain the concept of mobile ip"]) #each question should be an array

'''
########
import csv
from nltk import *
import os
import nltk
import csv,re
import pandas as pd
# Input file handed to processContent() at the bottom of this script.
dataset='test.csv'
# CSV consulted by SelectVerbCategory() to map a word to its category.
verblist="verb_list.csv"
import spacy
def SelectVerbCategory(verblist,verb):
    """Return the category that the CSV file ``verblist`` assigns to ``verb``.

    ``verb`` is lower-cased and compared for exact equality against every
    cell of every row; the cells themselves are compared as stored. For each
    row containing the word, the row's first cell is taken as its category.

    Args:
        verblist: Path of the CSV file to search.
        verb: Word to look up (any letter case).

    Returns:
        ``"Not in list"`` when no row contains the word, the category when
        all matching rows share one category, and ``"Overlapping"`` when the
        matching rows name more than one category.
    """
    verb = verb.lower()
    categories = set()
    # newline='' is the csv module's documented way to open its input; the
    # ``with`` block closes the file, so no explicit close() is needed.
    with open(verblist, newline='') as File:
        for row in csv.reader(File):
            if verb in row:
                categories.add(row[0])

    if not categories:
        return "Not in list"
    if len(categories) == 1:
        return categories.pop()
    return "Overlapping"

# Load the spaCy pipeline once at import time; processContent() uses it to tag each input line.
# NOTE(review): the 'en' shortcut name presumably only resolves on older spaCy releases — confirm against the installed version.
nlp=spacy.load('en')
# Parallel module-level lists filled by processContent(): question[i] pairs with Category[i].
question=[]
Category=[]
def processContent(dataset):
    """Classify each line of the text file ``dataset`` by its first key token.

    Every line is run through the module-level spaCy pipeline ``nlp``. The
    first token that is a verb (``pos_ == 'VERB'``) or carries one of the
    tags WDT, WP, WP$ or WRB is looked up with ``SelectVerbCategory`` against
    the module-level ``verblist``. The line (without its newline) is appended
    to the global ``question`` list and the lookup result to the global
    ``Category`` list. Lines without such a token are skipped.

    Any exception is caught and printed rather than propagated, matching the
    script's best-effort behaviour.

    Args:
        dataset: Path of the input file, one question per line.
    """
    wh_tags = ("WDT", "WP", "WP$", "WRB")
    try:
        # Read-only access is all that is needed; 'r+' would fail on a
        # write-protected input file.
        with open(dataset, 'r') as lines:
            for row in lines:
                for token in nlp(row):
                    if token.pos_ == 'VERB' or token.tag_ in wh_tags:
                        # Look up first so both lists stay the same length
                        # if the lookup raises.
                        category = SelectVerbCategory(verblist, token.text)
                        question.append(row.strip('\n'))
                        Category.append(category)
                        break  # only the first matching token decides
    except Exception as e:
        print(str(e))
# Classify every line of the configured input file, filling `question` and `Category`.
processContent(dataset)

# Tabulate the collected questions next to their categories.
df=pd.DataFrame({'Questions':question,'Category':Category})




Binary file added BT/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file added BT/__pycache__/settings.cpython-34.pyc
Binary file not shown.
Binary file added BT/__pycache__/urls.cpython-34.pyc
Binary file not shown.
Binary file added BT/__pycache__/wsgi.cpython-34.pyc
Binary file not shown.
55 changes: 55 additions & 0 deletions NonOverlapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import csv
from nltk import *
import os
import nltk
import csv,re
# Input file read line by line by processContent().
dataset='dataset.csv'
# File that WriteToFile() appends result rows to.
output_file='dataset_output.csv'
# CSV consulted by SelectVerbCategory() to map a word to its category.
verblist="verb_list.csv"

def WriteToFile(output_file,content):
    """Echo ``content`` and append it to ``output_file`` as a single-cell CSV row.

    ``content`` is written as one field, so the csv module quotes it when it
    contains a comma.

    Args:
        output_file: Path of the CSV file to append to (created if missing).
        content: Text to print and to store as the row's only cell.
    """
    print(content)
    # newline='' lets the csv writer emit its own line terminator; without
    # it, platforms that translate '\n' would write '\r\r\n' after each row.
    with open(output_file, "a", newline='') as OP:
        csv.writer(OP).writerow([content])

def SelectVerbCategory(verblist,verb):
    """Return the category that the CSV file ``verblist`` assigns to ``verb``.

    ``verb`` is lower-cased and compared for exact equality against every
    cell of every row; the cells themselves are compared as stored. For each
    row containing the word, the row's first cell is taken as its category.

    Args:
        verblist: Path of the CSV file to search.
        verb: Word to look up (any letter case).

    Returns:
        ``"Not in list"`` when no row contains the word, the category when
        all matching rows share one category, and ``"Overlapping"`` when the
        matching rows name more than one category.
    """
    verb = verb.lower()
    categories = set()
    # newline='' is the csv module's documented way to open its input; the
    # ``with`` block closes the file, so no explicit close() is needed.
    with open(verblist, newline='') as File:
        for row in csv.reader(File):
            if verb in row:
                categories.add(row[0])

    if not categories:
        return "Not in list"
    if len(categories) == 1:
        return categories.pop()
    return "Overlapping"


def processContent(dataset):
    """Tag each single-sentence line of ``dataset`` and record verb categories.

    A line is processed only when ``sent_tokenize`` splits it into exactly
    one sentence; other lines are skipped. The line is word-tokenised and
    POS-tagged, and for every token whose tag starts with ``VB`` or ``WP``
    the word is looked up with ``SelectVerbCategory`` against the
    module-level ``verblist``. ``"<line>,<category>"`` is then passed to
    ``WriteToFile`` for the module-level ``output_file``. There is no early
    exit, so a line with several matching tokens produces several rows.

    Any exception is caught and printed rather than propagated, matching the
    script's best-effort behaviour.

    Args:
        dataset: Path of the input file, one question per line.
    """
    try:
        # Read-only access is all that is needed; 'r+' would fail on a
        # write-protected input file.
        with open(dataset, 'r') as lines:
            for row in lines:
                # Sentence tokenizer: skip lines holding several questions
                # in the same sub-question.
                if len(sent_tokenize(row)) != 1:
                    continue
                text = row.strip("\n")
                for (word, tag) in pos_tag(word_tokenize(row)):
                    # Same test as re.match(r"VB|WP", tag): prefix match.
                    if tag.startswith(("VB", "WP")):
                        category = SelectVerbCategory(verblist, word)
                        WriteToFile(output_file, text + "," + category)
    except Exception as e:
        print(str(e))

# Script entry: process the configured input file as soon as the module is run or imported.
processContent(dataset)


Binary file added classifier/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file added classifier/__pycache__/admin.cpython-34.pyc
Binary file not shown.
Binary file added classifier/__pycache__/apps.cpython-34.pyc
Binary file not shown.
Binary file added classifier/__pycache__/models.cpython-34.pyc
Binary file not shown.
Binary file added classifier/__pycache__/urls.cpython-34.pyc
Binary file not shown.
Binary file added classifier/__pycache__/views.cpython-34.pyc
Binary file not shown.
Binary file added classifier/bcl/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file added classifier/bcl/__pycache__/bcl.cpython-34.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
545 changes: 545 additions & 0 deletions dataset.csv

Large diffs are not rendered by default.

822 changes: 822 additions & 0 deletions dataset_output.csv

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions fileio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import csv
def SelectVerbCategory(verblist,verb):
    """Return the category that the CSV file ``verblist`` assigns to ``verb``.

    ``verb`` is lower-cased and compared for exact equality against every
    cell of every row; the cells themselves are compared as stored. For each
    row containing the word, the row's first cell is taken as its category.

    Args:
        verblist: Path of the CSV file to search.
        verb: Word to look up (any letter case).

    Returns:
        ``"Not a verb"`` when no row contains the word, the category when
        all matching rows share one category, and ``"Overlapping"`` when the
        matching rows name more than one category.
    """
    verb = verb.lower()
    categories = set()
    # newline='' is the csv module's documented way to open its input; the
    # ``with`` block closes the file, so no explicit close() is needed.
    with open(verblist, newline='') as File:
        for row in csv.reader(File):
            if verb in row:
                categories.add(row[0])

    if not categories:
        return "Not a verb"
    if len(categories) == 1:
        return categories.pop()
    return "Overlapping"

# Module-level lookup of "Who"; the returned category is not stored or printed.
SelectVerbCategory("non_overlapping_verbs.csv","Who")

def NonOverlapping(dataset,output_file,verb=""):
    """Copy the first three columns of ``dataset`` to ``output_file`` plus ``verb``.

    The original body referenced a name ``verb`` that was never defined, so
    it raised ``NameError`` on the first row. The value is now an optional
    argument appended as the fourth column of every output row.
    NOTE(review): the value intended for this column is not visible in this
    file — confirm with the author.

    Args:
        dataset: Path of the input CSV; each row needs at least three cells.
        output_file: Path of the CSV to create or overwrite.
        verb: Value written as the fourth cell of each row (default ``""``).

    Raises:
        IndexError: If an input row has fewer than three cells.
    """
    # newline='' on both files is the csv module's documented usage and
    # stops newline translation from altering the row terminators.
    with open(dataset, "r", newline='') as IP:
        reader = csv.reader(IP)
        with open(output_file, "w", newline='') as OP:
            writer = csv.writer(OP)
            for row in reader:
                writer.writerow([row[0], row[1], row[2], verb])
6 changes: 6 additions & 0 deletions non_overlapping_verbs.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Remembering ,find,omit,define,how,label,match,name,recall,spell,tell,what,when,where,which,who,why
Understanding,interpret,demonstrate,extend,illustrate,infer,outline,rephrase,summarize,translate
Applying ,make use of ,organize,apply,experiment with,interview,model,utilize
Analysing,categorize,conclusion,analyze,discover,dissect,distinguish,divide,examine,function,inference,inspect,motive,relationships,simplify,survey,take part in,test for,theme
Evaluating ,appraise,assess,award,conclude,agree,criteria,criticize,decide,deduct,defend,determine,disprove,evaluate,importance,influence,interpret,judge,mark,measure,perceive,prioritize,rate,recommend,support
Creating,change,combine,compile,compose,create,delete,design,discuss,elaborate,formulate,happen,imagine,improve,invent,make up,maximize,minimize,modify,original,originate,predict,propose,solution,suppose,test,theory
29 changes: 29 additions & 0 deletions overlapping/CountVectoriser+LinearSVC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import spacy
from sklearn.svm import LinearSVC
# Load the spaCy pipeline.
# NOTE(review): `nlp` is not referenced again in the visible part of this script — confirm whether the load is still needed.
nlp=spacy.load('en')
# Open the workbook and read the sheet holding the labelled questions.
file='non-overlapping.xlsx'
sheet=pd.ExcelFile(file)
df=sheet.parse("Sheet1")
# Category selection: the binary "Remembering vs. everything else" relabelling
# below is disabled, so the Category column is used as stored in the sheet.
#df['Category']=np.where(df['Category'] =="Remembering" ,1, 0)
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
question_train, question_test, category_train, category_test = train_test_split(df['Question'], df['Category'], test_size=0.3, random_state=45)

# Bag-of-words features: the vocabulary is learned from the training questions
# only and then reused to encode the test questions.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(question_train).toarray()
y_train=list(category_train)
X_test=vectorizer.transform(question_test).toarray()
# NOTE(review): int() is applied to the test labels and the predictions but not
# to y_train — assumes Category holds integer-like values; TODO confirm against the sheet.
y_test=list(map(int, category_test))
# Linear SVM with the Crammer-Singer multi-class option selected.
clf=LinearSVC(multi_class='crammer_singer')
clf.fit(X_train,y_train)
y_pred=list(map(int,clf.predict(X_test)))



# Print the accuracy of the predictions on the held-out questions.
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))
Empty file added overlapping/Svc+token.py
Empty file.
45 changes: 45 additions & 0 deletions overlapping/countVectoriser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import spacy

from sklearn.svm import SVC
# Load the spaCy pipeline.
# NOTE(review): `nlp` is not referenced again in the visible part of this script — confirm whether the load is still needed.
nlp=spacy.load('en')
# Open the workbook and read the sheet holding the labelled questions.
file='non-overlapping.xlsx'
sheet=pd.ExcelFile(file)
df=sheet.parse("Sheet1")
# Collapse Category to a binary target: 1 for "Remembering", 0 for every other value.
df['Category']=np.where(df['Category'] =="Remembering" ,1, 0)
np_array=df.values
# All rows except the last ten are used for training.
# NOTE(review): assumes column 0 holds the question text and column 2 the Category value — confirm against the sheet layout.
question_train=np_array[:len(np_array)-10,0]
category_train=np_array[:len(np_array)-10,2]

# The last ten rows are held out for testing.
question_test=np_array[-10:,0]
category_test=np_array[-10:,2]
# Create the bag-of-words transform.
vectorizer = CountVectorizer()
# Tokenize and build the vocabulary from the training questions only
# (the disabled line below would also include the test questions).

#vectorizer.fit(list(question_train) + list(question_test))
vectorizer.fit(question_train)

# Encode the training questions as dense count vectors.

X_train = vectorizer.transform(question_train).toarray()
# Training labels as a plain list.

y_train=list(category_train)

# Encode the test questions with the same vocabulary; test labels are cast to int.
X_test=vectorizer.transform(question_test).toarray()
y_test=list(map(int, category_test))
# SVC classifier with default settings.

# NOTE(review): Counter is imported but not used in the visible part of this script.
from collections import Counter
clf=SVC()


# Fit, predict on the held-out rows, and print the score reported by clf.score().
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score)
Binary file added overlapping/dataset.xlsx
Binary file not shown.
Binary file added overlapping/non-overlapping.xlsx
Binary file not shown.
73 changes: 73 additions & 0 deletions overlapping/overlapping+SVC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC

# Open the workbook and read the sheet holding the labelled questions into a DataFrame.
file='overlapping.xlsx'
sheet=pd.ExcelFile(file)
df=sheet.parse("Sheet2")

def labelEncoding(df):
    """Add a ``CategoryLabel`` column holding the encoded ``Category`` values.

    The labels come from a freshly fitted ``LabelEncoder``. The frame is
    modified in place and the same object is returned; the encoder itself is
    not kept.
    """
    encoder = preprocessing.LabelEncoder()
    encoded = encoder.fit_transform(df['Category'])
    df['CategoryLabel'] = encoded
    return df
# labelEncoding() returns its argument, so label_encode is the same DataFrame object as df.
label_encode=labelEncoding(df)
#writing the labels for categories into file
#writing the labels for categories into file
def WriteToFile(df):
    """Write ``df`` to sheet ``Sheet2`` of ``overlapping.xlsx``.

    The writer is used as a context manager so the workbook is saved and
    closed on exit; ``ExcelWriter.save()`` is no longer available in current
    pandas releases.

    NOTE(review): the writer is opened in its default mode, which presumably
    replaces the whole workbook, so other sheets in ``overlapping.xlsx``
    would be lost — confirm this is intended.

    Args:
        df: DataFrame to store.
    """
    with pd.ExcelWriter('overlapping.xlsx') as writer:
        df.to_excel(writer, sheet_name='Sheet2')
def splitSet(df):
    """Split questions and encoded labels into training and test parts.

    20% of the rows are held out, with ``random_state=50`` so the split is
    repeatable.

    Returns:
        A list ``[question_train, question_test, category_train,
        category_test]``.
    """
    from sklearn.model_selection import train_test_split

    questions = df['Question']
    labels = df['CategoryLabel']
    parts = train_test_split(questions, labels, test_size=0.2, random_state=50)
    # train_test_split yields train/test for the first input, then for the second.
    return list(parts)
# Split the labelled frame and unpack the four parts in the order splitSet() returns them.
split=splitSet(label_encode)
question_train=split[0]
question_test=split[1]
category_train=split[2]
category_test=split[3]
def Vectoriser(question_train,question_test,category_train,category_test):
    """Encode questions as dense bag-of-words arrays and labels as lists.

    The vocabulary is fitted on ``question_train`` only and reused to encode
    ``question_test``. Test labels are cast to ``int``; training labels are
    copied into a list unchanged.

    Returns:
        A list ``[X_train, X_test, y_train, y_test]``.
    """
    bag_of_words = CountVectorizer()

    # Fit on the training questions first, then reuse the vocabulary.
    train_matrix = bag_of_words.fit_transform(question_train).toarray()
    test_matrix = bag_of_words.transform(question_test).toarray()

    train_labels = list(category_train)
    test_labels = [int(label) for label in category_test]
    return [train_matrix, test_matrix, train_labels, test_labels]
# Vectorise both splits and unpack the result in the order Vectoriser() returns it.
vectoriser_output =Vectoriser(question_train,question_test,category_train,category_test)
X_train=vectoriser_output[0]
X_test=vectoriser_output[1]
y_train=vectoriser_output[2]
y_test=vectoriser_output[3]

def SVCclassifier(X_train,X_test,y_train,y_test):
    """Fit a ``LinearSVC`` on the training data and predict the test rows.

    ``y_test`` is accepted but not used inside the function.

    Returns:
        The predicted labels for ``X_test`` as a list of ``int``.
    """
    model = LinearSVC()
    model.fit(X_train, y_train)
    return [int(label) for label in model.predict(X_test)]
# Train the classifier and keep its predictions for the held-out questions.
y_pred=SVCclassifier(X_train,X_test,y_train,y_test)

def outputMetrics(y_pred,y_test):
    """Print the classification report, confusion matrix and accuracy.

    The confusion matrix rows and columns follow the fixed label order
    ``[4, 5, 1, 0, 3, 2]``.
    """
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
    )

    matrix_order = [4, 5, 1, 0, 3, 2]
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred, labels=matrix_order))
    print(accuracy_score(y_test, y_pred))
# Print the evaluation metrics for the predictions made above.
outputMetrics(y_pred,y_test)
Loading