python mrjob

Python mrjob - Finding 10 longest words, but mrjob returns duplicate words


I am using Python mrjob to find the 10 longest words in a text file. I get a result, but it contains duplicate words. How do I obtain only unique words (i.e., remove the duplicates)?

%%file most_chars.py  
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+") # matches runs of word characters and apostrophes; used with findall() to extract the words from each line


class MostChars(MRJob):
    """Single-step job that emits the ten longest words found in the input.

    Duplicate words are not collapsed: a word that occurs several times in
    the input can occupy several of the ten output slots.
    """

    def steps(self):
        """Return the job's only step: word extraction, then top-ten selection."""
        longest_words_step = MRStep(
            mapper=self.mapper_get_words,
            reducer=self.reducer_find_longest_words,
        )
        return [longest_words_step]

    def mapper_get_words(self, _, line):
        """Yield ``None, (length, word)`` for every WORD_RE match in ``line``.

        The length is measured on the token as matched; the emitted word is
        the lower-cased, stripped form of that token.
        """
        for token in WORD_RE.findall(line):
            normalized = token.lower().strip()
            # Every pair shares the key None, so a single reducer call
            # receives all of them.
            yield None, (len(token), normalized)

    def reducer_find_longest_words(self, _, word_count_pairs):
        """Yield the ten largest ``(length, word)`` pairs, longest first.

        The incoming key is ignored; the mapper always emits None. Pairs
        with equal length are ordered by word, in descending order. Each
        pair is yielded unchanged, so its first element (the length) becomes
        the output key and its second element (the word) the output value.
        """
        ranked = list(word_count_pairs)
        ranked.sort(reverse=True)

        for entry in ranked[:10]:
            yield entry
              
# Run the job when this file is executed as a script (not when it is imported).
if __name__ == '__main__':
    MostChars.run()

Actual Output:

18  "overcapitalization"
18  "overcapitalization"
18  "overcapitalization"
17  "uncomprehendingly"
17  "misunderstandings"
17  "disinterestedness"
17  "disinterestedness"
17  "disinterestedness"
17  "disinterestedness"
17  "conventionalities"

Expected Output:

18  "overcapitalization"
17  "uncomprehendingly"
17  "misunderstandings"
17  "disinterestedness"
17  "conventionalities"

and 5 more unique words


Solution

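  • Update reducer_find_longest_words so that it keeps only the unique pairs. Each pair is converted to a tuple so that it can be stored in a set (lists are not hashable), and the de-duplicated tuples are then converted back to lists before sorting.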

        def reducer_find_longest_words(self, _, word_count_pairs):
            """Yield the ten largest distinct ``(length, word)`` pairs, longest first.

            The incoming key is ignored. Each yielded pair is a list, so its
            first element (the length) becomes the output key and its second
            element (the word) the output value.
            """
            # Pairs are turned into tuples so they are hashable; the set then
            # drops every repeated (length, word) combination.
            distinct = {tuple(entry) for entry in word_count_pairs}

            # Descending sort: longest first, equal lengths ordered by word
            # (also descending).
            ranked = sorted(distinct, reverse=True)

            for entry in ranked[:10]:
                yield list(entry)