I have the code below, which gives me the word count, but I don't understand how to get the frequency of the first letter of all the words. If there are three words starting with C in the file, I would expect the output to be "C 3". I don't need to distinguish between upper and lower case, so 'a' and 'A' should be counted as the same letter.
from mrjob.job import MRJob
class Job(MRJob):
    """Job that counts how often each whitespace-separated token occurs."""

    def mapper(self, Key, value):
        """Emit ``(token, 1)`` for every whitespace-separated token in ``value``.

        ``Key`` is not used.
        """
        # split() without arguments already discards leading and trailing
        # whitespace, so no separate strip() is needed.
        for token in value.split():
            yield token, 1

    def reducer(self, Key, values):
        """Emit ``Key`` together with the sum of ``values``."""
        total = sum(values)
        yield Key, total
# Start the job only when this file is executed as a script.
if __name__ == "__main__":
    Job.run()
You can change the default example on https://pypi.org/project/mrjob/:
"""The classic MapReduce job: count the frequency of words. """ from mrjob.job import MRJob import re WORD_RE = re.compile(r"[\w']+") class MRWordFreqCount(MRJob): def mapper(self, _, line): for word in WORD_RE.findall(line): yield (word.lower(), 1) def combiner(self, word, counts): yield (word, sum(counts)) def reducer(self, word, counts): yield (word, sum(counts))
which counts whole (lowercased) words, into the following version:
"""The changed MapReduce job: count the frequency of words
starting with the same (case insensitive) letter."""
from mrjob.job import MRJob
import re
# A "word" is a run of word characters (letters, digits, underscore) and
# apostrophes, so a match may begin with a character that is not a letter.
WORD_RE = re.compile(r"[\w']+")
class MyWordCount(MRJob):
    """Count how many words start with each letter, ignoring case."""

    def mapper(self, _, line):
        """Emit ``(letter, 1)`` for each word in ``line`` that starts with a letter.

        WORD_RE also matches apostrophes, underscores and digits, so a token
        such as ``'quoted'`` would otherwise be counted under ``'``. Leading
        apostrophes and underscores are stripped first, and tokens that still
        do not begin with a letter (for example ``42``) are skipped.
        """
        for word in WORD_RE.findall(line):
            # Slicing (rather than indexing) yields "" instead of raising
            # when nothing is left after stripping.
            initial = word.lstrip("'_")[:1]
            if initial.isalpha():
                # Lowercase so that 'a' and 'A' share one key.
                yield initial.lower(), 1

    def combiner(self, word, counts):
        """Emit the key with the sum of its counts.

        ``word`` holds the single lowercased letter emitted by ``mapper``.
        """
        yield word, sum(counts)

    def reducer(self, word, counts):
        """Emit the key with the sum of its counts.

        ``word`` holds the single lowercased letter emitted by ``mapper``.
        """
        yield word, sum(counts)
# Start the job only when this file is executed as a script.
if __name__ == "__main__":
    MyWordCount.run()
save it as my_word_count.py
and start it like so:
python my_word_count.py README.rst > counts.txt
The result is then found in counts.txt