I am new to Python and I want to use the MRJob package to count the relative frequency of word pairs. I wrote the code below, but it doesn't produce the correct output. Can you please help me find my mistakes? The relative frequency is defined as $f(A \mid B) = \frac{\mathrm{count}(A, B)}{\mathrm{count}(B)} = \frac{\mathrm{count}(A, B)}{\sum_{A'} \mathrm{count}(A', B)}$
import re
from collections import defaultdict
from mrjob.job import MRJob
# Tokenizer: a "word" is a maximal run of word characters (\w) and apostrophes.
WORD_RE = re.compile(r"[\w']+")
class MRRelativeFreq(MRJob):
    """Relative frequency of co-occurring word pairs.

    For every ordered pair of different words (A, B) that appear on the same
    input line, the job outputs::

        (A, B) -> (count(A, B), count(A, B) / count(B))

    where count(B) is the sum of count(A', B) over every word A' that
    co-occurs with B.  Records are keyed on the conditioning word B, so one
    reducer call sees every (A, count) for that B and can compute the
    denominator without any state shared between tasks.
    """

    def mapper(self, _, line):
        """Emit ``B, (A, 1)`` for each ordered pair of different words.

        Args:
            _: Input key, unused.
            line: One line of input text.

        Yields:
            ``(B, (A, 1))`` two-tuples.  The conditioning word B is the key;
            the co-occurring word A travels in the value.
        """
        # Lower-case before comparing so that "This" and "this" are treated
        # as the same word and never paired with each other.
        words = [w.lower() for w in WORD_RE.findall(line)]
        for word in words:
            for wordpair in words:
                if word != wordpair:
                    yield wordpair, (word, 1)

    def combiner(self, key, values):
        """Pre-aggregate the ``(A, n)`` values seen for one conditioning word.

        Args:
            key: The conditioning word B.
            values: Iterable of ``(A, n)`` partial counts.

        Yields:
            ``(B, (A, n))`` with one record per distinct A.  The key is kept
            unchanged so that the reducer still receives records grouped by B.
        """
        cnts = defaultdict(int)
        for word, count in values:
            cnts[word] += count
        for word, count in cnts.items():
            yield key, (word, count)

    def reducer(self, key, values):
        """Turn the counts for one conditioning word into relative frequencies.

        Args:
            key: The conditioning word B.
            values: Iterable of ``(A, n)`` partial counts for that B.

        Yields:
            ``((A, B), (count, count / total))`` where ``total`` is the sum
            of all counts received for B.
        """
        cnts = defaultdict(int)
        total = 0
        for word, count in values:
            total += count
            cnts[word] += count
        # total > 0 whenever cnts is non-empty, so the division is safe.
        for word, count in cnts.items():
            yield (word, key), (count, float(count) / total)
# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    MRRelativeFreq.run()
You will need an intermediate data structure (in this case a defaultdict)
to count the total number of times each word appears.
import re
from collections import defaultdict
from itertools import combinations
from mrjob.job import MRJob
from mrjob.step import MRStep
# Tokenizer: a "word" is a maximal run of word characters (\w) and apostrophes.
WORD_RE = re.compile(r"[\w']+")
class MRRelativeFreq(MRJob):
    """Two-step job computing count(x, y) / count(x) for word pairs.

    A pair (x, y) is counted once for every time x appears before y on the
    same input line (``itertools.combinations`` over the line's words), and
    count(x) is the sum of count(x, y) over every y.

    Step 1 counts each pair and re-keys the result by x.  Step 2 receives
    every ``(y, count)`` for one x, so it can compute the denominator from
    its own input.  Nothing is kept in memory between steps; such state
    would only be visible when every task runs inside a single process.
    """

    def steps(self):
        """Return the two steps: pair counting, then normalisation."""
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer),
            MRStep(
                reducer=self.reducer_s2),
        ]

    def mapper(self, _, line):
        """Emit ``((x, y), 1)`` for each pair of different words on a line.

        Args:
            _: Input key, unused.
            line: One line of input text.
        """
        # Lower-case before comparing so "This"/"this" are the same word.
        words = [w.lower() for w in WORD_RE.findall(line)]
        for x, y in combinations(words, 2):
            if x != y:
                yield (x, y), 1

    def combiner(self, pair, counts):
        """Sum the partial counts of one pair."""
        yield pair, sum(counts)

    def reducer(self, pair, counts):
        """Finish counting one pair and re-key it by its first word.

        Yields:
            ``(x, (y, count))`` so that the next step groups by x.
        """
        x, y = pair
        yield x, (y, sum(counts))

    def reducer_s2(self, x, ycnt):
        """Normalise the pair counts of one first word.

        Args:
            x: The first word of the pairs.
            ycnt: Iterable of ``(y, count)`` records for that x.

        Yields:
            ``((x, y), relative_frequency)`` rounded to two decimals.
        """
        # Intermediate tally: one entry per y, summed defensively in case
        # the same y arrives in more than one record.
        per_y = defaultdict(int)
        for y, count in ycnt:
            per_y[y] += count
        total = sum(per_y.values())
        # Every count is at least 1, so total is non-zero when per_y is
        # non-empty.  float() keeps this a true division on Python 2.
        for y, count in per_y.items():
            yield (x, y), round(count / float(total), 2)
# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    MRRelativeFreq.run()
Given a file of the below structure:
"""
this is something
this is not
or else this is
"""
I get the following results:
["or", "else"] 0.33
["or", "is"] 0.33
["or", "this"] 0.33
["this", "is"] 0.6
["this", "not"] 0.2
["this", "something"] 0.2
["is", "not"] 0.5
["is", "something"] 0.5
["else", "is"] 0.5
["else", "this"] 0.5
Used this as a hint.