I am trying to solve a HackerRank problem: determine the conditional frequency distribution of all the words (lowercased, with stop words removed) for the given categories 'cfdconditions' and events 'cfdevents'. Also compute the conditional frequency distribution for the categories 'cfdconditions' where the events are words ending with 'ing' or 'ed'. Then display the frequency modal for both distributions.
My code is -
def calculateCFD(cfdconditions, cfdevents):
    """Print two conditional frequency tables for Brown corpus categories.

    The first table counts the lowercased, non-stop-word tokens of each
    category in ``cfdconditions`` and shows the columns named in
    ``cfdevents``.  The second table counts, per category, how many
    lowercased non-stop-word tokens end in ``'ing'`` and how many end in
    ``'ed'``.

    Args:
        cfdconditions: Brown corpus category names; also the row order of
            both printed tables.
        cfdevents: Words to show as columns of the first table.

    Returns:
        A 2-tuple holding whatever the two ``tabulate`` calls return
        (``tabulate`` prints the table; in current NLTK it returns None).
    """
    from nltk import ConditionalFreqDist
    from nltk.corpus import brown
    from nltk.corpus import stopwords

    stopword = set(stopwords.words('english'))

    word_pairs = []
    suffix_pairs = []
    # dict.fromkeys drops repeated categories while keeping their order, so
    # a category listed twice is not counted twice.
    for genre in dict.fromkeys(cfdconditions):
        # Read each category once and feed both distributions from it.
        for word in brown.words(categories=genre):
            token = word.lower()
            if token in stopword:
                continue
            word_pairs.append((genre, token))
            # 'ing' is tested first, matching the original if/elif order.
            if token.endswith('ing'):
                suffix_pairs.append((genre, 'ing'))
            elif token.endswith('ed'):
                suffix_pairs.append((genre, 'ed'))

    cdev_cfd = ConditionalFreqDist(word_pairs)
    # The keyword must be spelled ``conditions``; any other spelling (or a
    # positional argument) is ignored and rows fall back to sorted order.
    a = cdev_cfd.tabulate(conditions=cfdconditions, samples=cfdevents)

    inged_cfd = ConditionalFreqDist(suffix_pairs)
    b = inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])
    return (a, b)
But the result still fails 2 test cases, for which my output is:
many years
adventure 24 32
fiction 29 44
science_fiction 11 16
ed ing
adventure 3281 1844
fiction 2943 1767
science_fiction 574 293
and
good bad better
adventure 39 9 30
fiction 60 17 27
mystery 45 13 29
science_fiction 14 1 4
ed ing
adventure 3281 1844
fiction 2943 1767
mystery 2382 1374
science_fiction 574 293
If anyone can help me with a solution, it would be a great help.
Try this code and see if it works.
from nltk.corpus import brown,stopwords
def calculateCFD(cfdconditions, cfdevents):
    """Print two conditional frequency tables for Brown corpus categories.

    The first table counts the lowercased, non-stop-word tokens of each
    requested category and shows the columns named in ``cfdevents``.  The
    second table counts, per category, how many lowercased non-stop-word
    tokens end in ``'ing'`` and how many end in ``'ed'``.

    Args:
        cfdconditions: Category names to tabulate; also the row order of
            both printed tables.
        cfdevents: Words to show as columns of the first table.

    Returns:
        None.  Both tables are written by ``tabulate``.
    """
    # Imported locally so the block does not rely on a module-level
    # ``import nltk`` being present elsewhere in the file.
    from nltk import ConditionalFreqDist

    stopword = set(stopwords.words('english'))

    # Only the requested categories are ever tabulated, so only those are
    # read.  Filtering brown.categories() (rather than iterating
    # cfdconditions directly) keeps unknown names out of brown.words() and
    # counts each category once even if it is requested twice.
    requested = set(cfdconditions)
    genres = [genre for genre in brown.categories() if genre in requested]

    word_pairs = []
    suffix_pairs = []
    for genre in genres:
        # Read each category once and feed both distributions from it.
        for word in brown.words(categories=genre):
            token = word.lower()
            if token in stopword:
                continue
            word_pairs.append((genre, token))
            # 'ing' is tested first, matching the original if/elif order.
            if token.endswith('ing'):
                suffix_pairs.append((genre, 'ing'))
            elif token.endswith('ed'):
                suffix_pairs.append((genre, 'ed'))

    cdev_cfd = ConditionalFreqDist(word_pairs)
    cdev_cfd.tabulate(conditions=cfdconditions, samples=cfdevents)

    inged_cfd = ConditionalFreqDist(suffix_pairs)
    inged_cfd.tabulate(conditions=cfdconditions, samples=['ed', 'ing'])