python mapreduce mrjob

Counting relative frequency with pairs and stripes in MapReduce


I am new to Python and I want to use the mrjob package to count the relative frequency of word pairs. I wrote the code below, but it doesn't produce the correct output. Can you please help me find my mistakes? The relative frequency is defined as $f(A \mid B) = \frac{\mathrm{count}(A, B)}{\mathrm{count}(B)} = \frac{\mathrm{count}(A, B)}{\sum_{A'} \mathrm{count}(A', B)}$.

import re
from collections import defaultdict

from mrjob.job import MRJob

# Matches one token: a run of word characters and/or apostrophes.
WORD_RE = re.compile(r"[\w']+")


class MRRelativeFreq(MRJob):
    # Asker's attempt at computing f(A|B) = count(A, B) / count(B) for word
    # pairs that share an input line. The NOTE(review) comments below mark
    # the places where the data flowing between the three methods does not
    # line up.

    def mapper(self, _, line):
        # Pairs every token of the line with every token of the same line
        # that differs from it; the comparison is done before lower-casing.
        for word in WORD_RE.findall(line):
          for wordpair in WORD_RE.findall(line):
            if word != wordpair:
               # NOTE(review): this yields a 3-tuple; mrjob documents
               # mappers as yielding (key, value) pairs -- presumably this
               # should be ((word, wordpair), 1). Confirm against the docs.
               yield (word.lower(), wordpair.lower(), 1)

    def reducer(self, key, values):
        # Accumulates a count per `word` and a grand total over all values
        # received for `key`, then emits (count, count / total) per entry.
        cnts = defaultdict(int)
        total = 0
        for (word, count) in values:
          # NOTE(review): `cnt` is assigned here but never read.
          cnt=0
          total += count
          cnts[word] += count

        # NOTE(review): each key of `cnts` is unpacked as a 2-item (k, kp)
        # pair, so this only works if `word` above is itself such a pair.
        for (k,kp), v in cnts.items():
            yield (k,kp), (v, float(v) / total) 

    def combiner(self, key, values):
        # Sums the values for `key`, then re-emits the result under the
        # single key None with the original key moved into the value.
        # NOTE(review): since every record now carries the key None, the
        # `total` computed in reducer() spans every pair, not one word.
        yield None, (key, sum(values))


# Start the job when this file is executed as a script.
if __name__ == '__main__':
    MRRelativeFreq.run()

Solution

  • You will need an intermediate data structure (in this case a defaultdict) to count the total number of times each word appears.

    import re
    from collections import defaultdict
    from itertools import combinations
    
    from mrjob.job import MRJob
    from mrjob.step import MRStep
    
    # Matches one token: a run of word characters and/or apostrophes.
    WORD_RE = re.compile(r"[\w']+")
    
    
    class MRRelativeFreq(MRJob):
        """Compute the relative frequency f(y | x) = count(x, y) / count(x, *).

        A pair (x, y) is counted once each time word ``x`` appears before a
        different word ``y`` on the same input line; count(x, *) is the sum
        of count(x, y) over every ``y``.

        Everything is keyed by the left word ``x``, so one reducer call sees
        all neighbour counts for that word and can work out the denominator
        locally. No state is kept on the class or instance between tasks:
        such state only exists inside a single Python process, so a
        denominator stored by one task would be missing (and the division
        would fail) in a task that runs in a different process.
        """

        def steps(self):
            """Return the job's single map/combine/reduce step."""
            return [
                MRStep(
                    mapper=self.mapper,
                    combiner=self.combiner,
                    reducer=self.reducer),
            ]

        def mapper(self, _, line):
            """Yield ``x, (y, 1)`` for each ordered pair of distinct words.

            Words are lower-cased *before* they are compared, so "This this"
            does not produce a ("this", "this") pair.
            """
            words = [word.lower() for word in WORD_RE.findall(line)]
            for x, y in combinations(words, 2):
                if x != y:
                    yield x, (y, 1)

        def combiner(self, x, neighbour_counts):
            """Pre-sum the ``(y, count)`` values for ``x`` to cut shuffle size."""
            partial = defaultdict(int)
            for y, count in neighbour_counts:
                partial[y] += count
            for y, count in partial.items():
                yield x, (y, count)

        def reducer(self, x, neighbour_counts):
            """Yield ``(x, y), frequency`` rounded to two decimal places."""
            stripe = defaultdict(int)
            for y, count in neighbour_counts:
                stripe[y] += count
            # The denominator count(x, *) is the sum over all neighbours.
            total = sum(stripe.values())
            # Sorted so the rows for one word come out in a stable order.
            for y, count in sorted(stripe.items()):
                # float() keeps this a true division on Python 2 as well.
                yield (x, y), round(float(count) / total, 2)
    
    # Start the job when this file is executed as a script.
    if __name__ == '__main__':
        MRRelativeFreq.run()
    

    Given a file of the below structure:

    """
    this is something
    this is not
    or else this is
    """
    

    I get the following results:

    ["or", "else"]  0.33
    ["or", "is"]    0.33
    ["or", "this"]  0.33
    ["this", "is"]  0.6
    ["this", "not"] 0.2
    ["this", "something"]   0.2
    ["is", "not"]   0.5
    ["is", "something"] 0.5
    ["else", "is"]  0.5
    ["else", "this"]    0.5
    

    Used this as a hint.