python mrjob

Write a job that counts how often each letter occurs as the first letter of a word in a file. For example, if three words start with "c", the answer would be "c 3".


I have the code below, which gives me the word count, but I don't understand how to get the frequency of the first letter of each word. If three words in the file start with C, I would expect the output to be "C 3". I don't need to distinguish between upper and lower case, so 'a' and 'A' should be counted as the same letter.

from mrjob.job import MRJob

class Job(MRJob):
    """Word-count job: tallies how often each whitespace-separated token occurs."""

    def mapper(self, Key, value):
        """Emit ``(token, 1)`` for every whitespace-separated token in ``value``.

        ``Key`` is accepted but not used.
        """
        tokens = value.strip().split()
        yield from ((token, 1) for token in tokens)

    def reducer(self, Key, values):
        """Emit ``Key`` paired with the sum of its ``values``."""
        total = sum(values)
        yield Key, total
# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    Job.run()

Solution

  • You can change the default example on https://pypi.org/project/mrjob/:

    """The classic MapReduce job: count the frequency of words.
    """
    from mrjob.job import MRJob
    import re
    
    # Matches one "word": a run of word characters (\w) and/or apostrophes.
    WORD_RE = re.compile(r"[\w']+")
    
    
    class MRWordFreqCount(MRJob):
        """Tally how often each lowercased word appears in the input."""

        def mapper(self, _, line):
            """Emit ``(lowercased word, 1)`` for every WORD_RE match in ``line``."""
            yield from ((token.lower(), 1) for token in WORD_RE.findall(line))

        def combiner(self, word, counts):
            """Emit ``word`` with the sum of the ``counts`` received for it."""
            partial = sum(counts)
            yield word, partial

        def reducer(self, word, counts):
            """Emit ``word`` with the sum of the ``counts`` received for it."""
            total = sum(counts)
            yield word, total
    

    which counts full (lowercased) words, into the following:

    """The changed MapReduce job: count the frequency of words
    starting with the same (case insensitive) letter."""
    from mrjob.job import MRJob
    import re
    
    # Matches one "word": a run of word characters (\w) and/or apostrophes.
    WORD_RE = re.compile(r"[\w']+")    
    
    class MyWordCount(MRJob):
        """Count how many words start with each letter, ignoring case."""

        def mapper(self, _, line):
            """Emit ``(letter, 1)`` for every word found in ``line``.

            The key is the word's first alphabetic character, lowercased.
            Tokens that contain no letter at all are skipped.
            """
            for word in WORD_RE.findall(line):
                # WORD_RE also matches apostrophes, digits and underscores, so
                # word[0] is not guaranteed to be a letter (e.g. "'tis").
                # Key on the first real letter instead.
                first_letter = next((ch for ch in word if ch.isalpha()), None)
                if first_letter is not None:
                    yield (first_letter.lower(), 1)

        def combiner(self, word, counts):
            """Emit the key with the sum of its ``counts``.

            ``word`` holds the lowercased first letter emitted by the mapper.
            """
            yield (word, sum(counts))

        def reducer(self, word, counts):
            """Emit the key with the sum of its ``counts``.

            ``word`` holds the lowercased first letter emitted by the mapper.
            """
            yield (word, sum(counts))
    
    
    # Launch the job only when this file is executed as a script.
    if __name__ == '__main__':
         MyWordCount.run()
    

    Save it as my_word_count.py and run it like so:

    python my_word_count.py README.rst > counts.txt

    The result is then found in counts.txt